Example #1
def define_rnn(rnn_options, audio_conf):

    rnn = DeepSpeech(rnn_hidden_size=800,
                     nb_layers=5,
                     labels=rnn_options['labels'],
                     rnn_type=rnn_options['rnn_type'],
                     audio_conf=audio_conf,
                     bidirectional=True)

    parameters = rnn.parameters()

    return (rnn, parameters)
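A quick usage sketch for define_rnn: the rnn_options keys follow how the function indexes the dict, and the audio_conf values mirror the other examples below; every concrete value here is a placeholder, not the original caller.

import torch
import torch.nn as nn

# Hypothetical call site; the label string and audio_conf values are
# assumptions modeled on the other examples in this collection.
rnn_options = {'labels': "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ ", 'rnn_type': nn.LSTM}
audio_conf = dict(sample_rate=16000, window_size=0.02,
                  window_stride=0.01, window='hamming')
rnn, parameters = define_rnn(rnn_options, audio_conf)
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True)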
Example #2
def main():
    global args, train_logger, test_logger
    args = options.parse_args()
    os.makedirs(args.log_dir)
    test_logger = Logger(os.path.join(args.log_dir, 'test.log'))
    with open(os.path.join(args.log_dir, 'config.log'), 'w') as f:
        f.write(args.config_str)
    if not args.evaluate:
        os.makedirs(args.checkpoint_dir)
        train_logger = Logger(os.path.join(args.log_dir, 'train.log'))
    loss_results, cer_results = torch.FloatTensor(
        args.epochs), torch.FloatTensor(args.epochs)

    if args.visdom:
        from visdom import Visdom
        viz = Visdom()
        opts = dict(title=args.experiment_id,
                    ylabel='',
                    xlabel='Epoch',
                    legend=['Loss', 'CER'])
        viz_window = None
        epochs = torch.arange(0, args.epochs)

    if args.resume:
        print('Loading checkpoint model %s' % args.resume)
        checkpoint = torch.load(args.resume)
        model = DeepSpeech.load_model_checkpoint(checkpoint)
        model = torch.nn.DataParallel(model,
                                      device_ids=list(range(args.nGPU))).cuda()
        labels = DeepSpeech.get_labels(model)
        audio_conf = DeepSpeech.get_audio_conf(model)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    nesterov=True)
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = int(checkpoint.get('epoch',
                                         0))  # Index start at 0 for training
        loss_results, cer_results = checkpoint['loss_results'], checkpoint[
            'cer_results']
        if args.epochs > loss_results.numel():
            loss_results.resize_(args.epochs)
            cer_results.resize_(args.epochs)
            loss_results[start_epoch:].zero_()
            cer_results[start_epoch:].zero_()
        # Add previous scores to visdom graph
        if args.visdom and start_epoch > 0:
            x_axis = epochs[0:start_epoch]
            y_axis = torch.stack(
                (loss_results[0:start_epoch], cer_results[0:start_epoch]),
                dim=1)
            viz_window = viz.line(
                X=x_axis,
                Y=y_axis,
                opts=opts,
            )
    else:
        start_epoch = args.start_epoch
        with open(args.labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))

        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[args.rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=not args.look_ahead)
        model = torch.nn.DataParallel(model,
                                      device_ids=list(range(args.nGPU))).cuda()
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    nesterov=True)

    # define loss function (criterion) and decoder
    best_cer = None
    criterion = CTCLoss()
    decoder = GreedyDecoder(labels)

    # define dataloader
    if not args.evaluate:
        train_dataset = SpectrogramDataset(
            audio_conf=audio_conf,
            manifest_filepath=args.train_manifest,
            labels=labels,
            normalize=True,
            augment=args.augment)
        train_sampler = BucketingSampler(train_dataset,
                                         batch_size=args.batch_size)
        train_loader = AudioDataLoader(train_dataset,
                                       num_workers=args.num_workers,
                                       batch_sampler=train_sampler)
        if not args.in_order and start_epoch != 0:
            print("Shuffling batches for the following epochs")
            train_sampler.shuffle()
    val_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                     manifest_filepath=args.val_manifest,
                                     labels=labels,
                                     normalize=True,
                                     augment=False)
    val_loader = AudioDataLoader(val_dataset,
                                 batch_size=args.batch_size,
                                 num_workers=args.num_workers)

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    if args.evaluate:
        validate(val_loader, model, decoder, 0)
        return

    for epoch in range(start_epoch, args.epochs):
        avg_loss = train(train_loader, train_sampler, model, criterion,
                         optimizer, epoch)
        cer = validate(val_loader, model, decoder, epoch)

        loss_results[epoch] = avg_loss
        cer_results[epoch] = cer

        adjust_learning_rate(optimizer)

        is_best = False
        if best_cer is None or best_cer > cer:
            print('Found better validated model')
            best_cer = cer
            is_best = True
        save_checkpoint(
            DeepSpeech.serialize(model,
                                 optimizer=optimizer,
                                 epoch=epoch,
                                 loss_results=loss_results,
                                 cer_results=cer_results), is_best, epoch)

        if not args.in_order:
            print("Shuffling batches...")
            train_sampler.shuffle()

        if args.visdom:
            x_axis = epochs[0:epoch + 1]
            y_axis = torch.stack(
                (loss_results[0:epoch + 1], cer_results[0:epoch + 1]), dim=1)
            if viz_window is None:
                viz_window = viz.line(
                    X=x_axis,
                    Y=y_axis,
                    opts=opts,
                )
            else:
                viz.line(
                    X=x_axis.unsqueeze(0).expand(y_axis.size(1),
                                                 x_axis.size(0)).transpose(
                                                     0, 1),  # Visdom fix
                    Y=y_axis,
                    win=viz_window,
                    update='replace',
                )
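Example #2 calls two helpers it never defines, adjust_learning_rate and save_checkpoint. The sketches below are inferred from the call sites; the anneal factor and checkpoint layout are assumptions, not the original code.

import os
import shutil
import torch

def adjust_learning_rate(optimizer, anneal_factor=1.1):
    # Divide every param group's learning rate by a fixed factor (assumed)
    for group in optimizer.param_groups:
        group['lr'] /= anneal_factor

def save_checkpoint(state, is_best, epoch, checkpoint_dir='checkpoints'):
    # Persist the serialized package; keep a copy of the best model so far
    file_path = os.path.join(checkpoint_dir, 'deepspeech_%d.pth' % (epoch + 1))
    torch.save(state, file_path)
    if is_best:
        shutil.copyfile(file_path,
                        os.path.join(checkpoint_dir, 'model_best.pth'))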
Example #3
def main():
    args = parser.parse_args()
    save_folder = args.save_folder

    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]

        viz_windows = [None, None, None]
        loss_results, cer_results, wer_results = torch.Tensor(
            args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
        epochs = torch.arange(1, args.epochs + 1)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       num_classes=len(labels))
    decoder = ArgMaxDecoder(labels)
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    print(model)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(args.epochs):
        model.train()
        end = time.time()
        avg_loss = 0
        for i, (data) in enumerate(train_loader):
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs)
            target_sizes = Variable(target_sizes)
            targets = Variable(targets)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int())

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1),
                          len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses))

        avg_loss /= len(train_loader)
        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        total_cer, total_wer = 0, 0
        for i, (data) in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int())

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x]))
            total_cer += cer
            total_wer += wer

        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.0f}\t'
              'Average CER {cer:.0f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            loss_results[epoch] = avg_loss
            wer_results[epoch] = wer
            cer_results[epoch] = cer
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1], wer_results[0:epoch + 1],
                cer_results[0:epoch + 1]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.epoch_save:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(checkpoint(model, args, len(labels), epoch), file_path)
    torch.save(checkpoint(model, args, len(labels)), args.final_model_path)
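The checkpoint(...) helper used for saving is also not shown. A plausible sketch follows, with the package fields modeled on DeepSpeech.serialize from the later examples; the exact keys are assumptions.

def checkpoint(model, args, num_classes, epoch=None):
    # Sketch only: bundle whatever is needed to rebuild the model later
    package = {
        'state_dict': model.state_dict(),
        'num_classes': num_classes,
        'hidden_size': args.hidden_size,
        'hidden_layers': args.hidden_layers,
    }
    if epoch is not None:
        package['epoch'] = epoch
    return package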
Example #4
def train_main(args):
    args.distributed = args.world_size > 1
    main_proc = True
    if args.distributed:
        if args.gpu_rank:
            torch.cuda.set_device(int(args.gpu_rank))
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
        main_proc = args.rank == 0  # Only the first proc should save models
    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor(
        args.epochs)
    best_wer = None
    if args.visdom and main_proc:
        from visdom import Visdom

        viz = Visdom()
        opts = dict(title=args.id, ylabel='', xlabel='Epoch', legend=['Loss', 'WER', 'CER'])
        viz_window = None
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard and main_proc:
        os.makedirs(args.log_dir, exist_ok=True)
        from tensorboardX import SummaryWriter

        tensorboard_writer = SummaryWriter(args.log_dir)
    os.makedirs(save_folder, exist_ok=True)

    avg_loss, start_epoch, start_iter = 0, 0, 0
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
        model = DeepSpeech.load_model_package(package)
        labels = DeepSpeech.get_labels(model)
        audio_conf = DeepSpeech.get_audio_conf(model)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
        if not args.finetune:  # Don't want to restart training
            if args.cuda:
                model.cuda()
            optimizer.load_state_dict(package['optim_dict'])
            start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
            start_iter = package.get('iteration', None)
            if start_iter is None:
                start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
                start_iter = 0
            else:
                start_iter += 1
            avg_loss = int(package.get('avg_loss', 0))
            loss_results, cer_results, wer_results = package['loss_results'], package[
                'cer_results'], package['wer_results']
            # Add previous scores to visdom graph
            if main_proc and args.visdom and loss_results is not None and start_epoch > 0:
                x_axis = epochs[0:start_epoch]
                y_axis = torch.stack(
                    (loss_results[0:start_epoch], wer_results[0:start_epoch], cer_results[0:start_epoch]),
                    dim=1)
                viz_window = viz.line(
                    X=x_axis,
                    Y=y_axis,
                    opts=opts,
                )
            # Add previous scores to tensorboard logs
            if main_proc and args.tensorboard and loss_results is not None and start_epoch > 0:
                for i in range(start_epoch):
                    values = {
                        'Avg Train Loss': loss_results[i],
                        'Avg WER': wer_results[i],
                        'Avg CER': cer_results[i]
                    }
                    tensorboard_writer.add_scalars(args.id, values, i + 1)
    else:
        with open(args.labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))

        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
    criterion = CTCLoss()
    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
                                       normalize=True, augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    if not args.distributed:
        train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(train_dataset, batch_size=args.batch_size,
                                                    num_replicas=args.world_size, rank=args.rank)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers, batch_sampler=train_sampler)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad:
        print("Shuffling batches for the following epochs")
        train_sampler.shuffle(start_epoch)

    if args.cuda:
        model.cuda()
        if args.distributed:
            model = torch.nn.parallel.DistributedDataParallel(model,
                                                              device_ids=(int(args.gpu_rank),) if args.rank else None)

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_sampler):
                break
            inputs, targets, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            # measure data loading time
            data_time.update(time.time() - end)

            if args.cuda:
                inputs = inputs.cuda()

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            loss = criterion(out, targets, output_sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            inf = float("inf")
            if args.distributed:
                loss_value = reduce_tensor(loss, args.world_size)[0]
            else:
                loss_value = loss.item()
            if loss_value == inf or loss_value == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                    (epoch + 1), (i + 1), len(train_sampler), batch_time=batch_time, data_time=data_time, loss=losses))
            if args.checkpoint_per_batch > 0 and i > 0 and (i + 1) % args.checkpoint_per_batch == 0 and main_proc:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i,
                                                loss_results=loss_results,
                                                wer_results=wer_results, cer_results=cer_results, avg_loss=avg_loss),
                           file_path)
            del loss
            del out
        avg_loss /= len(train_sampler)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        with torch.no_grad():
            for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)):
                inputs, targets, input_percentages, target_sizes = data
                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()

                # unflatten targets
                split_targets = []
                offset = 0
                for size in target_sizes:
                    split_targets.append(targets[offset:offset + size])
                    offset += size

                if args.cuda:
                    inputs = inputs.cuda()

                out, output_sizes = model(inputs, input_sizes)

                decoded_output, _ = decoder.decode(out.data, output_sizes)
                target_strings = decoder.convert_to_strings(split_targets)
                wer, cer = 0, 0
                for x in range(len(target_strings)):
                    transcript, reference = decoded_output[x][0], target_strings[x][0]
                    wer += decoder.wer(transcript, reference) / float(len(reference.split()))
                    cer += decoder.cer(transcript, reference) / float(len(reference))
                total_cer += cer
                total_wer += wer
                del out
            wer = total_wer / len(test_loader.dataset)
            cer = total_cer / len(test_loader.dataset)
            wer *= 100
            cer *= 100
            loss_results[epoch] = avg_loss
            wer_results[epoch] = wer
            cer_results[epoch] = cer
            print('Validation Summary Epoch: [{0}]\t'
                  'Average WER {wer:.3f}\t'
                  'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

            if args.visdom and main_proc:
                x_axis = epochs[0:epoch + 1]
                y_axis = torch.stack(
                    (loss_results[0:epoch + 1], wer_results[0:epoch + 1], cer_results[0:epoch + 1]), dim=1)
                if viz_window is None:
                    viz_window = viz.line(
                        X=x_axis,
                        Y=y_axis,
                        opts=opts,
                    )
                else:
                    viz.line(
                        X=x_axis.unsqueeze(0).expand(y_axis.size(1), x_axis.size(0)).transpose(0, 1),  # Visdom fix
                        Y=y_axis,
                        win=viz_window,
                        update='replace',
                    )
            if args.tensorboard and main_proc:
                values = {
                    'Avg Train Loss': avg_loss,
                    'Avg WER': wer,
                    'Avg CER': cer
                }
                tensorboard_writer.add_scalars(args.id, values, epoch + 1)
                if args.log_params:
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        tensorboard_writer.add_histogram(tag, to_np(value), epoch + 1)
                        tensorboard_writer.add_histogram(tag + '/grad', to_np(value.grad), epoch + 1)
            if args.checkpoint and main_proc:
                file_path = '%s/deepspeech_%d.pth' % (save_folder, epoch + 1)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                                wer_results=wer_results, cer_results=cer_results),
                           file_path)

            # anneal lr (unconditionally, not only when checkpointing)
            optim_state = optimizer.state_dict()
            optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
            optimizer.load_state_dict(optim_state)
            print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))

            if (best_wer is None or best_wer > wer) and main_proc:
                print("Found better validated model, saving to %s" % args.model_path)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                                wer_results=wer_results, cer_results=cer_results), args.model_path)
                best_wer = wer

            avg_loss = 0  # reset every epoch, not only on improvement
            if not args.no_shuffle:
                print("Shuffling batches...")
                train_sampler.shuffle(epoch)
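Example #4 uses two helpers it does not define: reduce_tensor, for averaging the loss across distributed workers, and to_np, for TensorBoard histograms. Minimal sketches under those assumptions:

import torch.distributed as dist

def reduce_tensor(tensor, world_size):
    # All-reduce the scalar loss and average it over the workers; the [0]
    # indexing at the call site suggests an older PyTorch where 0-dim
    # tensors were still indexable
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt

def to_np(x):
    # Detach a tensor and convert it to a NumPy array for logging
    return x.detach().cpu().numpy()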
Example #5
def main():
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print(
            "ERROR: GRU does not currently support activations other than tanh"
        )
        sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(
        params.epochs), torch.Tensor(params.epochs), torch.Tensor(
            params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=params.batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=params.lr,
                                momentum=params.momentum,
                                nesterov=True,
                                weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get(
            'epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))

        if args.start_epoch != -1:
            start_epoch = args.start_epoch

        loss_results[:start_epoch] = package['loss_results'][:start_epoch]
        cer_results[:start_epoch] = package['cer_results'][:start_epoch]
        wer_results[:start_epoch] = package['wer_results'][:start_epoch]
        print(loss_results)
        epoch = start_epoch

    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
        avg_training_loss = 0
    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      ctc_time=ctc_time,
                      loss=losses))

            del loss
            del out

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(
                  epoch + 1,
                  loss=avg_loss,
              ))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()

        wer, cer = eval_model(model, test_loader, decoder)

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results), file_path)
        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0][
            'lr'] / params.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" %
                  args.model_path)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results), args.model_path)
            best_wer = wer

        avg_loss = 0

        #If set to exit at a given accuracy, exit
        if params.exit_at_acc and (best_wer <= args.acc):
            break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")
Example #6
def main():
    args = parser.parse_args()
    torch.set_printoptions(profile="full")
    criterion = nn.CrossEntropyLoss()
    class_accu_reg = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    class_accu_sum = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    #print("FIRST LAYER TYPE:\t", args.first_layer_type)
    #print("MFCC TRANSFORM:\t\t", args.mfcc)

    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True,
                       cnn_features=args.cnn_features,
                       kernel=args.kernel,
                       first_layer_type=args.first_layer_type,
                       stride=args.stride,
                       mfcc=args.mfcc)

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)

    #scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.learning_rate_decay_epochs, gamma=args.learning_rate_decay_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

    avg_loss = 0
    start_epoch = 0
    start_iter = 0
    best_train_accu_reg = 0
    best_train_accu_sum = 0
    best_test_accu_reg = 0
    best_test_accu_sum = 0
    best_avg_loss = float("inf")
    epoch_70 = None
    epoch_90 = None
    epoch_95 = None
    epoch_99 = None

    utterance_sequence_length = int(args.utterance_miliseconds / 10)

    loss_begin = round(args.crop_begin / (10 * args.stride))
    loss_end = -round(args.crop_end / (10 * args.stride)) or None
    gap = loss_begin
    print("LOSS BEGIN:", loss_begin)
    print("LOSS END:", loss_end)

    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: ", DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    #losses = AverageMeter()

    print(args, "\n")

    for epoch in range(start_epoch, args.epochs):
        losses = AverageMeter()
        scheduler.step()
        optim_state_now = optimizer.state_dict()
        print('\nLEARNING RATE: {lr:.6f}'.format(
            lr=optim_state_now['param_groups'][0]['lr']))
        class_accu_reg.reset()
        class_accu_sum.reset()
        model.train()
        end = time.time()

        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break

            inputs, input_percentages, speaker_labels, mfccs = data

            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)

            mfccs = Variable(mfccs, requires_grad=False)
            if args.mfcc == "true":
                inputs = mfccs  # use MFCC features instead of raw spectrograms

            speaker_labels = Variable(speaker_labels, requires_grad=False)
            speaker_labels = speaker_labels.cuda(non_blocking=True).long()

            if args.cuda:
                inputs = inputs.cuda()

            # Collapse the feature dimension, then crop a random window whose
            # length is a fixed proportion of the utterance (the core sampling
            # step of this project)
            sizes = inputs.size()
            inputs = inputs.view(sizes[0], sizes[1] * sizes[2], sizes[3])
            start = random.randint(
                0, int((inputs.size(2) - 1) * (1 - args.sample_proportion)))
            duration = int(inputs.size(2) * args.sample_proportion)
            utterances = inputs[..., start:start + duration]
            out = model(utterances)
            out = out.transpose(0, 1)  # TxNxH

            # out is TxNxH: per-timestep class scores for every utterance

            class_accu_reg.add(out[round(out.size(0) / 2)].data,
                               speaker_labels.data)
            class_accu_sum.add(
                torch.sum(out[loss_begin:loss_end], 0).data,
                speaker_labels.data)

            if args.loss_type == "reg":
                processed_out = out[round(out.size(0) / 2)]
                processed_speaker_labels = speaker_labels
            if args.loss_type == "mult":
                #indices = torch.LongTensor([0,2])
                mult = (round(out.size(0) / 4), round(out.size(0) / 2),
                        round(3 * out.size(0) / 4))
                processed_out = out.contiguous()[mult, ...].view(-1, 48)
                processed_speaker_labels = speaker_labels.repeat(
                    out.size(0), 1)[mult, ...].view(-1)
                #processed_out = out.contiguous()[(round(out.size(0)/4),round(out.size(0)/2),round(3*out.size(0)/4)),...].view(-1,48)
                #processed_speaker_labels = speaker_labels.repeat(out.size(0),1)[(round(out.size(0)/4),round(out.size(0)/2),round(3*out.size(0)/4)),...].view(-1)
                #processed_out = out.contiguous()[(loss_begin,round(out.size(0)/2),loss_end),...].view(-1,48)
                #processed_speaker_labels = speaker_labels.repeat(out.size(0),1)[(loss_begin,round(out.size(0)/2),loss_end),...].view(-1)
                ##speaker_labels = speaker_labels.expand(20, out.size(0))
            elif args.loss_type == "sum":
                sum_begin = round(out.size(0) / 2) - round(out.size(0) / 4)
                sum_end = round(out.size(0) / 2) + round(out.size(0) / 4)
                processed_out = torch.sum(out[sum_begin:sum_end], 0)
                processed_speaker_labels = speaker_labels
                #processed_out = torch.sum(out[loss_begin:loss_end], 0)
                #processed_speaker_labels = speaker_labels
                #processed_out = torch.sum(out, 0)
                #processed_speaker_labels = speaker_labels
            elif args.loss_type == "full":
                full_begin = round(out.size(0) / 2) - round(out.size(0) / 4)
                full_end = round(out.size(0) / 2) + round(out.size(0) / 4)
                processed_out = out.contiguous()[full_begin:full_end].view(
                    -1, 48)
                processed_speaker_labels = speaker_labels.repeat(
                    out.size(0), 1)[full_begin:full_end].view(-1)
                ##speaker_labels = speaker_labels.expand(20, out.size(0))
                #processed_out = out.contiguous()[loss_begin:loss_end].view(-1,48)
                #processed_speaker_labels = speaker_labels.repeat(out.size(0),1)[loss_begin:loss_end].view(-1)
                ##speaker_labels = speaker_labels.expand(20, out.size(0))
                #processed_out = out.contiguous().view(-1, 48)
                #processed_speaker_labels = speaker_labels.repeat(out.size(0),1).view(-1)
                ##speaker_labels = speaker_labels.expand(20, out.size(0))
            #print("PROC OUTPUT: ====>>>>>\t" + str(processed_out.size()))
            #print("PROC LABELS: ====>>>>>\t" + str(processed_speaker_labels.size()))

            loss = criterion(processed_out, processed_speaker_labels)
            loss = loss / inputs.size(0)  # average the loss by minibatch
            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]
            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            #torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)

            # SGD step
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Loss {loss.val:.8f} ({loss.avg:.8f})\t'
                      'CARR {carr:.2f}\t'
                      'CARS {cars:.2f}\t'.format(
                          (epoch + 1), (i + 1),
                          len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses,
                          carr=class_accu_reg.value()[0],
                          cars=class_accu_sum.value()[0]))

            if args.cuda:
                torch.cuda.synchronize()

            del loss
            del out
            del processed_out
            del speaker_labels
            del processed_speaker_labels

        avg_loss /= len(train_loader)

        if best_avg_loss > avg_loss:
            best_avg_loss = avg_loss

        print("\nCURRENT EPOCH AVERAGE LOSS:\t", avg_loss)
        print("\nCURRENT EPOCH TRAINING RESULTS:\t",
              class_accu_reg.value()[0], "\t",
              class_accu_sum.value()[0], "\n")

        if best_train_accu_reg < class_accu_reg.value()[0]:
            best_train_accu_reg = class_accu_reg.value()[0]
        if best_train_accu_sum < class_accu_sum.value()[0]:
            best_train_accu_sum = class_accu_sum.value()[0]

        # record the first epoch at which training accuracy crosses each threshold
        if epoch_70 is None and class_accu_reg.value()[0] > 70:
            epoch_70 = epoch + 1
        if epoch_90 is None and class_accu_reg.value()[0] > 90:
            epoch_90 = epoch + 1
        if epoch_95 is None and class_accu_reg.value()[0] > 95:
            epoch_95 = epoch + 1
        if epoch_99 is None and class_accu_reg.value()[0] > 99:
            epoch_99 = epoch + 1

        start_iter = 0  # Reset start iteration for next epoch
        model.eval()

        class_accu_reg.reset()
        class_accu_sum.reset()

        for i, (data) in enumerate(test_loader):  # test

            inputs, input_percentages, speaker_labels, mfccs = data

            inputs = Variable(inputs, volatile=True)

            mfccs = Variable(mfccs, requires_grad=False)
            if args.mfcc == "true":
                inputs = mfccs  # use MFCC features instead of raw spectrograms

            speaker_labels = Variable(speaker_labels, requires_grad=False)
            speaker_labels = speaker_labels.cuda(non_blocking=True).long()

            if args.cuda:
                inputs = inputs.cuda()

            # Collapse the feature dimension; at test time the full utterance
            # is scored (no random crop)
            sizes = inputs.size()
            inputs = inputs.view(sizes[0], sizes[1] * sizes[2], sizes[3])
            utterances = inputs
            out = model(utterances)
            out = out.transpose(0, 1)  # TxNxH

            class_accu_reg.add(out[round(out.size(0) / 2)].data,
                               speaker_labels.data)
            class_accu_sum.add(
                torch.sum(out[loss_begin:loss_end], 0).data,
                speaker_labels.data)

            print('Validation Summary Epoch: [{0}]\t'
                  'CARR {carr:.2f}\t'
                  'CARS {cars:.2f}\t'.format(epoch + 1,
                                             carr=class_accu_reg.value()[0],
                                             cars=class_accu_sum.value()[0]))

            if args.cuda:
                torch.cuda.synchronize()

            del out

        print("\nCURRENT EPOCH TEST RESULTS:\t",
              class_accu_reg.value()[0], "\t",
              class_accu_sum.value()[0], "\n")

        if best_test_accu_reg < class_accu_reg.value()[0]:
            best_test_accu_reg = class_accu_reg.value()[0]
        if best_test_accu_sum < class_accu_sum.value()[0]:
            best_test_accu_sum = class_accu_sum.value()[0]

        print("\nBEST AVERAGE LOSS:\t\t", best_avg_loss)
        print("\nBEST EPOCH TRAINING RESULTS:\t", best_train_accu_reg, "\t",
              best_train_accu_sum)
        print("\nBEST EPOCH TEST RESULTS:\t", best_test_accu_reg, "\t",
              best_test_accu_sum)
        print("\nEPOCHS 70%, 90%, 95%, 99%:\t", epoch_70, "\t", epoch_90, "\t",
              epoch_95, "\t", epoch_99, "\n")

        torch.save(
            DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch),
            args.model_path)

        avg_loss = 0

        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler

        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)
        parameters = model.parameters()
        # SGD with Nesterov momentum (used elsewhere in this file) is replaced
        # by Adam in this variant:
        #optimizer = torch.optim.SGD(parameters, lr=args.lr,
        #                            momentum=args.momentum, nesterov=True)
        optimizer = torch.optim.Adam(parameters, lr=args.lr)
    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment,
                                       pitch=args.pitch,
                                       whitenoise=args.whitenoise)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
Exemplo n.º 8
0
def main():
    args = parser.parse_args()
    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(
        args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]

        viz_windows = [None, None, None]
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard:
        from logger import TensorBoardLogger
        try:
            os.makedirs(args.log_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                print('Directory already exists.')
                for file in os.listdir(args.log_dir):
                    file_path = os.path.join(args.log_dir, file)
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
            else:
                raise
        logger = TensorBoardLogger(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get(
            'epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = float(package.get('avg_loss', 0))  # the running loss is fractional
        loss_results, cer_results, wer_results = package[
            'loss_results'], package['cer_results'], package['wer_results']
        if args.visdom and \
                        package['loss_results'] is not None and start_epoch > 0:  # Add previous scores to visdom graph
            x_axis = epochs[0:start_epoch]
            y_axis = [
                loss_results[0:start_epoch], wer_results[0:start_epoch],
                cer_results[0:start_epoch]
            ]
            for x in range(len(viz_windows)):
                viz_windows[x] = viz.line(
                    X=x_axis,
                    Y=y_axis[x],
                    opts=opts[x],
                )
        if args.tensorboard and \
                        package['loss_results'] is not None and start_epoch > 0:  # Previous scores to tensorboard logs
            for i in range(start_epoch):
                info = {
                    'Avg Train Loss': loss_results[i],
                    'Avg WER': wer_results[i],
                    'Avg CER': cer_results[i]
                }
                for tag, val in info.items():
                    logger.scalar_summary(tag, val, i + 1)
        if not args.no_bucketing:
            print("Using bucketing sampler for the following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                labels=labels,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)
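            # input_percentages stores each utterance's true width as a fraction
            # of the padded batch width; scaling by the output length T recovers
            # per-utterance output lengths for the CTC loss.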

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            if args.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1),
                          len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses))
            if args.checkpoint_per_batch > 0 and i > 0 and (
                    i + 1) % args.checkpoint_per_batch == 0:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (
                    save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(
                    DeepSpeech.serialize(model,
                                         optimizer=optimizer,
                                         epoch=epoch,
                                         iteration=i,
                                         loss_results=loss_results,
                                         wer_results=wer_results,
                                         cer_results=cer_results,
                                         avg_loss=avg_loss), file_path)
            del loss
            del out
        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        for i, (data) in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs, volatile=True)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = input_percentages.mul_(int(seq_length)).int()

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x]))
            total_cer += cer
            total_wer += wer

            if args.cuda:
                torch.cuda.synchronize()
            del out
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
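        # wer/cer were accumulated per utterance, already normalized by the
        # reference word/character count, so dividing by the dataset size and
        # multiplying by 100 yields average error rates in percent.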
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            # epoch += 1
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1], wer_results[0:epoch + 1],
                cer_results[0:epoch + 1]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.tensorboard:
            info = {'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer}
            for tag, val in info.items():
                logger.scalar_summary(tag, val, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, to_np(value), epoch + 1)
                    logger.histo_summary(tag + '/grad', to_np(value.grad),
                                         epoch + 1)
        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results), file_path)
        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0][
            'lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        avg_loss = 0
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                labels=labels,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler

    torch.save(DeepSpeech.serialize(model, optimizer=optimizer),
               args.final_model_path)
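
The checkpoints written by DeepSpeech.serialize above can be restored the same way this script's own continue_from branch does. A minimal resume sketch, assuming the checkpoint layout used here ('state_dict'/'optim_dict' keys) and a freshly constructed model and optimizer as above; the path is a placeholder:

import torch

# Hypothetical checkpoint produced by one of the save calls above.
package = torch.load('models/deepspeech_final.pth.tar', map_location='cpu')
model.load_state_dict(package['state_dict'])
optimizer.load_state_dict(package['optim_dict'])
start_epoch = int(package.get('epoch', 1)) - 1  # epochs are stored 1-based
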
Exemplo n.º 9
0
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
    criterion = CTCLoss()
    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
                                       normalize=True, augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    if not args.distributed:
        train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(train_dataset, batch_size=args.batch_size,
                                                    num_replicas=args.world_size, rank=args.rank)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers, batch_sampler=train_sampler)
Exemplo n.º 10
0
def main():
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
      print("ERROR: GRU does not currently support activations other than tanh")
      sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
      print("ERROR: We should be using ReLU RNNs")
      sys.exit()

    print("=======================================================")
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(params.epochs), torch.Tensor(params.epochs), torch.Tensor(params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size = params.hidden_size,
                       nb_layers       = params.hidden_layers,
                       labels          = labels,
                       rnn_type        = supported_rnns[rnn_type],
                       audio_conf      = None,
                       bidirectional   = True,
                       rnn_activation  = params.rnn_act_type,
                       bias            = params.bias)

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=params.lr,
                                momentum=params.momentum, nesterov=False,
                                weight_decay = params.l2)
    cuda = torch.device('cuda')
    criterion = torch.nn.CTCLoss(reduction='none').to(cuda)
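    # reduction='none' makes CTCLoss return one loss per batch element; the
    # training loop below sums them itself before calling backward().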


    avg_loss = 0
    start_epoch = 0
    start_iter = 0
    avg_training_loss = 0
    if params.cuda:
        model.cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()
    forward_time = AverageMeter()
    backward_time = AverageMeter()

    filename = "/scratch-ml00/wang603/deepspeechData/deepspeech_train.pickle"
    batchedData = user_defined_input.Batch(filename)

    def train_one_epoch(epoch):
        avg_loss = 0
        for i in range(batchedData.numBatches):
#            if i == 1: return
            end = time.time()
            inputs, targets, input_percentages, target_sizes = batchedData.batch(last=False)

            # making all inputs Tensor
            inputs = torch.from_numpy(inputs)
            targets = torch.from_numpy(targets)
            input_percentages = torch.from_numpy(input_percentages)
            target_sizes = torch.from_numpy(target_sizes)
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            # measure forward pass time
            forward_start_time = time.time()
            out = model(inputs)
            # out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            # measure ctc loss computing time
            ctc_start_time = time.time()
            out = out.log_softmax(2)  #.detach().requires_grad_()
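            # torch.nn.CTCLoss expects (T, N, C) log-probabilities, hence the
            # log_softmax over the class dimension before computing the loss.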
            # print(sizes.shape)
            # print(out.shape)
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss_sum.data.item()

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            forward_time.update(time.time() - forward_start_time)

            # measure backward pass time
            backward_start_time = time.time()
            # compute gradient
            optimizer.zero_grad()
            loss_sum.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            backward_time.update(time.time() - backward_start_time)

            # measure elapsed time
            batch_time.update(time.time() - end)

            if (i % 20 == 0):
                print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Forward {forward_time.val:.3f} ({forward_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Backward {backward_time.val:.3f} ({backward_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                (epoch + 1), (i + 1), batchedData.numBatches, batch_time=batch_time,
                data_time=data_time, forward_time=forward_time, ctc_time=ctc_time,
                backward_time=backward_time, loss=losses))

            del loss
            del out

        avg_loss /= batchedData.numBatches #  len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        return avg_loss

    model.train()
    loss_save = []
    time_save = []
    for epoch in range(start_epoch, args.epochs):
        startTime = time.time()
        loss_save.append(train_one_epoch(epoch))
        endTime = time.time()
        time_save.append(endTime - startTime)
        print("epoch {} used {} seconds".format(epoch, endTime - startTime))

    time_save.sort()
    median_time = time_save[int(args.epochs / 2)]  # middle element of the sorted per-epoch times
    with open(args.write_to, "w") as f:
        f.write("unit: " + "1 epoch\n")
        for loss in loss_save:
            f.write("{}\n".format(loss))
        f.write("run time: " + str(0.0) + " " + str(median_time) + "\n")
Exemplo n.º 11
0
def main():
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
      print("ERROR: GRU does not currently support activations other than tanh")
      sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
      print("ERROR: We should be using ReLU RNNs")
      sys.exit()

    print("=======================================================")
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(params.epochs), torch.Tensor(params.epochs), torch.Tensor(params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    #cuda = torch.device('cuda')
    criterion = torch.nn.CTCLoss()#.to(cuda)

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    # audio_conf = dict(sample_rate=params.sample_rate,
    #                   window_size=params.window_size,
    #                   window_stride=params.window_stride,
    #                   window=params.window,
    #                   noise_dir=params.noise_dir,
    #                   noise_prob=params.noise_prob,
    #                   noise_levels=(params.noise_min, params.noise_max))

    # train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.train_manifest, labels=labels,
    #                                    normalize=True, augment=params.augment)
    # test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.val_manifest, labels=labels,
    #                                   normalize=True, augment=False)
    # train_loader = AudioDataLoader(train_dataset, batch_size=params.batch_size,
    #                                num_workers=1)
    # test_loader = AudioDataLoader(test_dataset, batch_size=params.batch_size,
    #                               num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size = params.hidden_size,
                       nb_layers       = params.hidden_layers,
                       labels          = labels,
                       rnn_type        = supported_rnns[rnn_type],
                       audio_conf      = None,
                       bidirectional   = True,
                       rnn_activation  = params.rnn_act_type,
                       bias            = params.bias)

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=params.lr,
                                momentum=params.momentum, nesterov=True,
                                weight_decay = params.l2)
    # decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = float(package.get('avg_loss', 0))  # the running loss is fractional

        if args.start_epoch != -1:
          start_epoch = args.start_epoch

        loss_results[:start_epoch] = package['loss_results'][:start_epoch]
        cer_results[:start_epoch] = package['cer_results'][:start_epoch]
        wer_results[:start_epoch] = package['wer_results'][:start_epoch]
        print(loss_results)
        epoch = start_epoch

    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
        avg_training_loss = 0
    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()
    forward_time = AverageMeter()
    backward_time = AverageMeter()

    filename = "/scratch/wu636/Lantern/src/out/PLDI19evaluation/deepspeech2/ds2-pytorch/data/test/deepspeech_train.pickle"
    # filename = "/scratch/wu636/training/speech_recognition/data/test/deep_speech_train.pickle"
    batchedData = user_defined_input.Batch(filename)

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i in range(batchedData.numBatches):
            inputs, targets, input_percentages, target_sizes = batchedData.batch()
            inputs = torch.from_numpy(inputs)
            targets = torch.from_numpy(targets)
            input_percentages = torch.from_numpy(input_percentages)
            target_sizes = torch.from_numpy(target_sizes)
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            # measure forward pass time
            forward_start_time = time.time()
            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)
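            # Note: torch.nn.CTCLoss expects (T, N, C) log-probabilities; unlike
            # Exemplo n.º 10, no log_softmax is applied here, which is only
            # correct if the model itself ends in a log-softmax layer.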

            # measure ctc loss computing time
            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data.item()

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            forward_time.update(time.time() - forward_start_time)

            # measure backward pass time
            backward_start_time = time.time()
            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            backward_time.update(time.time() - backward_start_time)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if ((i+1) % 20 == 0):
                print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Forward {forward_time.val:.3f} ({forward_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Backward {backward_time.val:.3f} ({backward_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                (epoch + 1), (i + 1), batchedData.numBatches, batch_time=batch_time,
                data_time=data_time, forward_time=forward_time, ctc_time=ctc_time,
                backward_time=backward_time, loss=losses))

            del loss
            del out

        avg_loss /= batchedData.numBatches #  len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))
Exemplo n.º 12
0
with open(args.labels_path) as label_file:
    labels = str(''.join(json.load(label_file)))

audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size)

rnn_type = args.rnn_type.lower()

model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   audio_conf=audio_conf,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type])

print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4,
                            momentum=0.9, nesterov=True)
model.cuda()
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model)

criterion = CTCLoss()

seconds = int(args.seconds)
batch_size = int(args.batch_size)


def iteration(inputs):
    # targets, align half of the audio
    targets = torch.ones(int(batch_size * ((seconds * 100) / 2)))
    print(denoiser)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(denoiser))

    criterion = CTCLoss()
    criterion1 = torch.nn.L1Loss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        model.eval()
        for name, module in model.named_modules():
            #print(module._get_name())
            if module._get_name() in ['GRU']:
                module.train()
        for param in model.parameters():
            param.requires_grad = False
        denoiser.train()
        end = time.time()
        start_epoch_time = time.time()
        for i, (data) in enumerate(zip(train_loader_clean, train_loader_adv),
                                   start=start_iter):
            if i == len(train_sampler_clean):
                break
            data_clean = data[0]
            data_adv = data[1]
            inputs, targets, input_percentages, target_sizes = data_clean
            inputs_adv, targets_adv, input_percentages_adv, target_sizes_adv = data_adv

            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            input_sizes_adv = input_percentages_adv.mul_(int(inputs_adv.size(3))).int()
Exemplo n.º 14
0
        '''
        model_student = DeepSpeech.load_model_package(package)
        parameters_student = model_student.parameters()
        optimizer_student = torch.optim.SGD(parameters_student, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
        '''
        # restart student model from scratch
        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model_student = DeepSpeech(rnn_hidden_size=args.hidden_size,
                                   nb_layers=args.hidden_layers,
                                   labels=labels,
                                   rnn_type=supported_rnns[rnn_type],
                                   audio_conf=audio_conf,
                                   bidirectional=args.bidirectional)
        parameters_student = model_student.parameters()
        optimizer_student = torch.optim.SGD(parameters_student,
                                            lr=args.lr,
                                            momentum=args.momentum,
                                            nesterov=True)

        if not args.finetune:  # Don't want to restart training
            optimizer_teacher.load_state_dict(package['optim_dict'])

            # Temporary fix for pytorch #2830 & #1442 while pull request #3658 in not incorporated in a release
            # TODO : remove when a new release of pytorch include pull request #3658
            if args.cuda:
                for state in optimizer_teacher.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()
Exemplo n.º 15
0
def main():
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
      print("ERROR: GRU does not currently support activations other than tanh")
      sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
      print("ERROR: We should be using ReLU RNNs")
      sys.exit()

    print("=======================================================")
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(params.epochs), torch.Tensor(params.epochs), torch.Tensor(params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.train_manifest, labels=labels,
                                       normalize=True, augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset, batch_size=params.batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size = params.hidden_size,
                       nb_layers       = params.hidden_layers,
                       labels          = labels,
                       rnn_type        = supported_rnns[rnn_type],
                       audio_conf      = audio_conf,
                       bidirectional   = True,
                       rnn_activation  = params.rnn_act_type,
                       bias            = params.bias)

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=params.lr,
                                momentum=params.momentum, nesterov=True,
                                weight_decay = params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = float(package.get('avg_loss', 0))  # the running loss is fractional

        if args.start_epoch != -1:
          start_epoch = args.start_epoch

        loss_results[:start_epoch] = package['loss_results'][:start_epoch]
        cer_results[:start_epoch] = package['cer_results'][:start_epoch]
        wer_results[:start_epoch] = package['wer_results'][:start_epoch]
        print(loss_results)
        epoch = start_epoch

    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
        avg_training_loss = 0
    if params.cuda:
        model         = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                (epoch + 1), (i + 1), len(train_loader), batch_time=batch_time,
                data_time=data_time, ctc_time=ctc_time, loss=losses))

            del loss
            del out

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()

        wer, cer = eval_model(model, test_loader, decoder)

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(
            epoch + 1, wer=wer, cer=cer))

        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results),
                       file_path)
        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / params.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))
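        # Dividing by params.learning_anneal every epoch gives geometric decay:
        # lr_t = lr_0 / learning_anneal**t.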

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results)
                       , args.model_path)
            best_wer = wer

        avg_loss = 0

        #If set to exit at a given accuracy, exit
        if params.exit_at_acc and (best_wer <= args.acc):
            break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")
Exemplo n.º 16
0
def convert(parser):
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
      print("ERROR: GRU does not currently support activations other than tanh")
      sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
      print("ERROR: We should be using ReLU RNNs")
      sys.exit()

    print("=======================================================")
    for arg in vars(args):
      print("***%s = %s " %  (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    val_batch_size = min(8, params.batch_size_val)
    print("Using bs={} for validation. Parameter found was {}".format(val_batch_size, params.batch_size_val))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.train_manifest, labels=labels,
                                       normalize=True, augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=params.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset, batch_size=val_batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size = params.hidden_size,
                       nb_layers       = params.hidden_layers,
                       labels          = labels,
                       rnn_type        = supported_rnns[rnn_type],
                       audio_conf      = audio_conf,
                       bidirectional   = False,
                       rnn_activation  = params.rnn_act_type,
                       bias            = params.bias)

    parameters = model.parameters()

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        if params.cuda:
            model = model.cuda()

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    ####################################################
    #  Begin ONNX conversion
    ####################################################
    model.train(False)
    # Input to the model
    data = next(iter(train_loader))
    inputs, targets, input_percentages, target_sizes = data
    inputs = Variable(inputs, requires_grad=False)
    target_sizes = Variable(target_sizes, requires_grad=False)
    targets = Variable(targets, requires_grad=False)

    if params.cuda:
        inputs = inputs.cuda()

    x = inputs
    print(x.size())

    # Export the model
    onnx_file_path = osp.join(osp.dirname(args.continue_from),
                              osp.basename(args.continue_from).split('.')[0] + ".onnx")
    print("Saving new ONNX model to: {}".format(onnx_file_path))
    torch.onnx.export(model,               # model being run
                      inputs,              # model input (or a tuple for multiple inputs)
                      onnx_file_path,      # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      verbose=False)
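
Once exported, the graph can be sanity-checked outside PyTorch. A minimal sketch using onnxruntime, assuming the export above succeeded (RNN models may need a specific opset) and that the model takes a single spectrogram tensor; the file path and the (N, C, freq, time) dummy shape are placeholders, not values from this script:

import numpy as np
import onnxruntime as ort

# Load the exported graph and run one dummy batch through it.
sess = ort.InferenceSession("deepspeech_checkpoint.onnx")  # hypothetical path
input_name = sess.get_inputs()[0].name
dummy = np.zeros((1, 1, 161, 300), dtype=np.float32)  # placeholder input shape
outputs = sess.run(None, {input_name: dummy})
print(outputs[0].shape)
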
Exemplo n.º 17
0
def main():
    args = parser.parse_args()
    save_folder = args.save_folder

    ########
    """
    loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor(
        args.epochs)
    best_wer = None
    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [dict(title=args.visdom_id + ' Loss', ylabel='Loss', xlabel='Epoch'),
                dict(title=args.visdom_id + ' WER', ylabel='WER', xlabel='Epoch'),
                dict(title=args.visdom_id + ' CER', ylabel='CER', xlabel='Epoch')]

        viz_windows = [None, None, None]
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard:
        from logger import TensorBoardLogger
        try:
            os.makedirs(args.log_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                print('Directory already exists.')
                for file in os.listdir(args.log_dir):
                    file_path = os.path.join(args.log_dir, file)
                    try:
                        if os.path.isfile(file_path):
                            os.unlink(file_path)
                    except Exception as e:
                        raise
            else:
                raise
        logger = TensorBoardLogger(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    """
    ########

    ########
    """
    criterion = CTCLoss()
    """
    criterion = nn.CrossEntropyLoss()
    class_accu = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    class_accu_sum = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    class_accu_sum_120 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    class_accu_sum_240 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    class_accu_sum_360 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    class_accu_sum_480 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    ########

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
                                       normalize=True, augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    ########
    """
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True,
                       cnn_features=args.cnn_features)
    """
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True,
                       cnn_features=args.cnn_features,
                       kernel=args.kernel,
                       stride=args.stride)
    ########

    ########
    # Disabled experiment: flatten the LSTM weights into the single contiguous
    # chunk that cuDNN expects.
    #def flat_model(model):
    #    for m in model.modules():
    #        if isinstance(m, nn.LSTM):
    #            m.flatten_parameters()
    ########

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                momentum=args.momentum, nesterov=True)

    ########
    #scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.learning_rate_decay_epochs, gamma=args.learning_rate_decay_rate)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
    ########

    ########
    """
    decoder = GreedyDecoder(labels)
    """
    ########

    ########
    """
    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))
        loss_results, cer_results, wer_results = package['loss_results'], package[
            'cer_results'], package['wer_results']
        if args.visdom and \
                        package['loss_results'] is not None and start_epoch > 0:  # Add previous scores to visdom graph
            x_axis = epochs[0:start_epoch]
            y_axis = [loss_results[0:start_epoch], wer_results[0:start_epoch], cer_results[0:start_epoch]]
            for x in range(len(viz_windows)):
                viz_windows[x] = viz.line(
                    X=x_axis,
                    Y=y_axis[x],
                    opts=opts[x],
                )
        if args.tensorboard and \
                        package['loss_results'] is not None and start_epoch > 0:  # Previous scores to tensorboard logs
            for i in range(start_epoch):
                info = {
                    'Avg Train Loss': loss_results[i],
                    'Avg WER': wer_results[i],
                    'Avg CER': cer_results[i]
                }
                for tag, val in info.items():
                    logger.scalar_summary(tag, val, i + 1)
        if not args.no_bucketing and epoch != 0:
            print("Using bucketing sampler for the following epochs")
            train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf, manifest_filepath=args.train_manifest,
                                                         labels=labels,
                                                         normalize=True, augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
    """
    avg_loss = 0
    start_epoch = 0
    start_iter = 0

    best_train_accu = 0
    best_train_accu_sum = 0
    best_train_accu_sum_120 = 0
    best_train_accu_sum_240 = 0
    best_train_accu_sum_360 = 0
    best_train_accu_sum_480 = 0
    best_test_accu = 0
    best_test_accu_sum = 0
    best_test_accu_sum_120 = 0
    best_test_accu_sum_240 = 0
    best_test_accu_sum_360 = 0
    best_test_accu_sum_480 = 0
    best_avg_loss = float("inf") # sys.float_info.max # 1000000
    epoch_70 = None
    epoch_90 = None
    epoch_95 = None
    epoch_99 = None

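    # NOTE (added): each output frame spans 10 ms * stride, so `multiplier` is
    # roughly the number of frames covering 60 ms (hence "should be 1.5" for
    # stride 4: 60 ms / 40 ms per frame). The _120/_240/... accuracy meters
    # trim k * multiplier frames from each end, i.e. about 120/240/... ms of
    # total context.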
    if args.stride == 1: multiplier = 6
    if args.stride == 2: multiplier = 3
    if args.stride == 3: multiplier = 2
    if args.stride == 4: multiplier = 1  # (Should be 1.5...)

    #sample_time_steps = int(args.sample_miliseconds / 10)
    loss_begin = round(args.crop_begin / (10 * args.stride))
    loss_end = -round(args.crop_end / (10 * args.stride)) or None

    print("LOSS BEGIN:", loss_begin)
    print("LOSS END:", loss_end)
    ########
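    # NOTE (added): frames are 10 ms wide before the stride-`s` downsampling,
    # so a crop of `crop_begin` ms maps to round(crop_begin / (10 * stride))
    # output frames; e.g. stride=2, crop_begin=200 gives loss_begin == 10.
    # The `or None` turns a zero end-crop into a slice that reaches the end.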

    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        ########
        #scheduler.step()
        optim_state_now = optimizer.state_dict()
        print('\nLEARNING RATE: {lr:.6f}'.format(lr=optim_state_now['param_groups'][0]['lr']))
        class_accu.reset()
        class_accu_sum.reset()
        class_accu_sum_120.reset()
        class_accu_sum_240.reset()
        class_accu_sum_360.reset()
        class_accu_sum_480.reset()
        ########
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break

            ########
            """
            inputs, targets, input_percentages, target_sizes = data
            """
            inputs, targets, input_percentages, target_sizes, speaker_labels = data
            ########

            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)

            ########
            """
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)
            """
            speaker_labels = Variable(speaker_labels, requires_grad=False)
            ########

            if args.cuda:
                inputs = inputs.cuda()

            ########
            """
            out = model(inputs)
            """
            #temp_random = random.randint(0, (inputs.size(3)-1)-sample_time_steps)
            #print("INPUT", inputs[...,temp_random:temp_random+sample_time_steps].size(),temp_random, temp_random+sample_time_steps)
            #out = model(inputs[...,temp_random:temp_random+sample_time_steps])
            #print("OUTPUT", out.size())
            # Randomly crop a contiguous `sample_proportion` fraction of the time axis.
            crop_len = int(inputs.size(3) * args.sample_proportion)
            start = random.randint(0, int((inputs.size(3) - 1) * (1 - args.sample_proportion)))
            print("INPUT", inputs.size(3), inputs[..., start:start + crop_len].size(), start, start + crop_len)
            out = model(inputs[..., start:start + crop_len])
            print("OUTPUT", out.size())
            ########

            out = out.transpose(0, 1)  # TxNxH

            ########
            speaker_labels = speaker_labels.cuda(non_blocking=True).long()  # was `async=True`: renamed in PyTorch 0.4; `async` is a keyword in Python >= 3.7
            # Print the model output as a per-frame class-score sequence for each utterance...
            torch.set_printoptions(profile="full")
            ####print("OUT: " + str(out.size()), "SPEAKER LABELS:" + str(speaker_labels.size()), "INPUT PERCENTAGES MEAN: " + str(input_percentages.mean()))
            """
            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            loss = criterion(out, targets, sizes, target_sizes)
            """
            #print(out[:,:,0])
            #print("SPEAKER LABELS: " + str(speaker_labels))
            #print(out[0][0])
            #softmax_output = F.softmax(out).data # This does NOT do what I want...
            #softmax_output_alt = flex_softmax(out, axis=2).data # This is FINE!!! <<<===
            #print(softmax_output[0][0])
            #print(softmax_output_alt[0][0])
            ####new_out = torch.sum(out, 0)
            ####new_out = torch.sum(out[20:], 0)
            #print(out.size())
            #print(new_out.size())
            #print(out[-1].size())
            ########

            ########
            if args.loss_type == "reg":
                #loss_out = out[-1]; loss_speaker_labels = speaker_labels
                loss_out = out[round(out.size(0)/2)]; loss_speaker_labels = speaker_labels
                #print("LOSS TYPE = REGULAR")
            elif args.loss_type == "sum":
                loss_out = torch.sum(out[loss_begin:loss_end], 0); loss_speaker_labels = speaker_labels
                #print("LOSS TYPE = SUM")
            elif args.loss_type == "full":
                # Unverified alternatives (left from development), do not use:
                #   loss_out = out.contiguous().view(-1, 48); loss_speaker_labels = speaker_labels.repeat(out.size(0))
                #   loss_out = out.contiguous().view(-1, 48); loss_speaker_labels = speaker_labels.repeat(1, out.size(0)).squeeze()
                loss_out = out.contiguous()[loss_begin:loss_end].view(-1, 48)
                loss_speaker_labels = speaker_labels.repeat(out.size(0), 1)[loss_begin:loss_end].view(-1)
                #print("LOSS TYPE = FULL")
            print("LOSS_OUT: " + str(loss_out.size()), "SPEAKER LABELS:" + str(loss_speaker_labels.size()))
            loss = criterion(loss_out, loss_speaker_labels)
            ########
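            # NOTE (added): with `out` shaped T x N x 48 (time, batch, speaker
            # classes), the three modes reduce it before cross-entropy:
            # "reg" scores only the middle frame (N x 48), "sum" adds logits
            # over the cropped span (N x 48), and "full" scores every cropped
            # frame independently ((T' * N) x 48) with labels repeated per frame.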

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]
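            # NOTE (added): `loss.data[0]` is the PyTorch <= 0.3 idiom; on >= 0.4 this would be `loss.item()`.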

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            ########
            #if args.stride == 1: multiplier = 6
            #if args.stride == 2: multiplier = 3
            #if args.stride == 3: multiplier = 2
            #if args.stride == 4: multiplier = 1 #(Should be 1.5...)
            #if args.stride == 5: multiplier = 1 #(Should be 1.25...)

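            # NOTE (added): `class_accu` scores the speaker from the single
            # middle output frame; `class_accu_sum` sums logits over all frames
            # first, giving an utterance-level decision.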
            class_accu.add(out[round(out.size(0)/2)].data, speaker_labels.data)
            class_accu_sum.add(torch.sum(out, 0).data, speaker_labels.data)

            #class_accu_sum_120.add(torch.sum(out[1*multiplier:-1*multiplier], 0).data, speaker_labels.data)
            #class_accu_sum_240.add(torch.sum(out[2*multiplier:-2*multiplier], 0).data, speaker_labels.data)
            #class_accu_sum_360.add(torch.sum(out[3*multiplier:-3*multiplier], 0).data, speaker_labels.data)
            #class_accu_sum_480.add(torch.sum(out[4*multiplier:-4*multiplier], 0).data, speaker_labels.data)
            ####class_accu_sum_120.add(torch.sum(out[round(out.size(0)/2)-1*multiplier:round(out.size(0)/2)+1*multiplier], 0).data, speaker_labels.data)
            ####class_accu_sum_240.add(torch.sum(out[round(out.size(0)/2)-2*multiplier:round(out.size(0)/2)+2*multiplier], 0).data, speaker_labels.data)
            ####class_accu_sum_360.add(torch.sum(out[round(out.size(0)/2)-3*multiplier:round(out.size(0)/2)+3*multiplier], 0).data, speaker_labels.data)
            ####class_accu_sum_480.add(torch.sum(out[round(out.size(0)/2)-4*multiplier:round(out.size(0)/2)+4*multiplier], 0).data, speaker_labels.data)

            #accu_out3 = torch.sum(flex_softmax(out[20:], axis=2), 0)
            #print(classaccu.value()[0], classaccu.value()[1])
            # Cross Entropy Loss for a Sequence (Time Series) of Output?
            #output = output.view(-1,29)
            #target = target.view(-1)
            #criterion = nn.CrossEntropyLoss()
            #loss = criterion(output,target)
            ########

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

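            # Clip the global gradient norm to `max_norm` to keep RNN training
            # stable. (NOTE, added: `clip_grad_norm` became `clip_grad_norm_`
            # in PyTorch >= 0.4.)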
            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            if args.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:

                ########
                """
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                    (epoch + 1), (i + 1), len(train_loader), batch_time=batch_time,
                    data_time=data_time, loss=losses))
                """
                print('Epoch: [{0}][{1}/{2}]\t'
                      # 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      # 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'CAR {car:.3f}\t'
                      'CAR_SUM {car_sum:.3f}\t'
                      #'CAR_SUM_120 {car_sum_120:.3f}\t'
                      #'CAR_SUM_240 {car_sum_240:.3f}\t'
                      #'CAR_SUM_360 {car_sum_360:.3f}\t'
                      #'CAR_SUM_480 {car_sum_480:.3f}\t'
                      .format((epoch + 1), (i + 1), len(train_loader), batch_time=batch_time, data_time=data_time,
                              loss=losses, car=class_accu.value()[0], car_sum=class_accu_sum.value()[0],
                      #        car_sum_240=class_accu_sum_240.value()[0], car_sum_120=class_accu_sum_120.value()[0],
                      #        car_sum_360=class_accu_sum_360.value()[0], car_sum_480=class_accu_sum_480.value()[0]
                              )
                      )
                ########

            ########
            """
            if args.checkpoint_per_batch > 0 and i > 0 and (i + 1) % args.checkpoint_per_batch == 0:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i,
                                                loss_results=loss_results,
                                                wer_results=wer_results, cer_results=cer_results, avg_loss=avg_loss),
                           file_path)
            """
            ########

            del loss
            del out

            ########
            del loss_out
            del speaker_labels
            del loss_speaker_labels
            ########

        avg_loss /= len(train_loader)

        ########
        """
        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(
            epoch + 1, loss=avg_loss))
        """

        if (best_avg_loss > avg_loss): best_avg_loss = avg_loss

        print("\nCURRENT EPOCH TRAINING RESULTS:\t", class_accu.value()[0], "\t", class_accu_sum.value()[0],"\t",
              #class_accu_sum_120.value()[0], class_accu_sum_240.value()[0], class_accu_sum_360.value()[0], "\t", class_accu_sum_480.value()[0], "\n"
              )

        if (best_train_accu < class_accu.value()[0]): best_train_accu = class_accu.value()[0]
        if (best_train_accu_sum < class_accu_sum.value()[0]): best_train_accu_sum = class_accu_sum.value()[0]
        #if (best_train_accu_sum_120 < class_accu_sum_120.value()[0]): best_train_accu_sum_120 = class_accu_sum_120.value()[0]
        #if (best_train_accu_sum_240 < class_accu_sum_240.value()[0]): best_train_accu_sum_240 = class_accu_sum_240.value()[0]
        #if (best_train_accu_sum_360 < class_accu_sum_360.value()[0]): best_train_accu_sum_360 = class_accu_sum_360.value()[0]
        #if (best_train_accu_sum_480 < class_accu_sum_480.value()[0]): best_train_accu_sum_480 = class_accu_sum_480.value()[0]

        get_70 = ((class_accu.value()[0] > 70) or (class_accu_sum.value()[0] > 70)
                  #or (class_accu_sum_120.value()[0] > 70) or (class_accu_sum_240.value()[0] > 70)
                  #or (class_accu_sum_360.value()[0] > 70) or (class_accu_sum_480.value()[0] > 70)
                  )
        if epoch_70 is None and get_70: epoch_70 = epoch + 1
        get_90 = ((class_accu.value()[0] > 90) or (class_accu_sum.value()[0] > 90)
                  #or (class_accu_sum_120.value()[0] > 90) or (class_accu_sum_240.value()[0] > 90)
                  #or (class_accu_sum_360.value()[0] > 90) or (class_accu_sum_480.value()[0] > 90)
                  )
        if epoch_90 is None and get_90: epoch_90 = epoch + 1
        get_95 = ((class_accu.value()[0] > 95) or (class_accu_sum.value()[0] > 95)
                  #or (class_accu_sum_120.value()[0] > 95) or (class_accu_sum_240.value()[0] > 95)
                  #or (class_accu_sum_360.value()[0] > 95) or (class_accu_sum_480.value()[0] > 95)
                  )
        if epoch_95 is None and get_95: epoch_95 = epoch + 1
        get_99 = ((class_accu.value()[0] > 99) or (class_accu_sum.value()[0] > 99)
                  #or (class_accu_sum_120.value()[0] > 99) or (class_accu_sum_240.value()[0] > 99)
                  #or (class_accu_sum_360.value()[0] > 99) or (class_accu_sum_480.value()[0] > 99)
                  )
        if epoch_99 is None and get_99: epoch_99 = epoch + 1
        ########

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()

        ########
        class_accu.reset()
        class_accu_sum.reset()
        class_accu_sum_120.reset()
        class_accu_sum_240.reset()
        class_accu_sum_360.reset()
        class_accu_sum_480.reset()
        ########

        for i, (data) in enumerate(test_loader):  # test

            ########
            """
            inputs, targets, input_percentages, target_sizes = data
            """
            inputs, targets, input_percentages, target_sizes, speaker_labels = data
            ########

            inputs = Variable(inputs, volatile=True)

            ########
            speaker_labels = Variable(speaker_labels, requires_grad=False)
            speaker_labels = speaker_labels.cuda(non_blocking=True).long()  # was `async=True` (pre-0.4 PyTorch)
            """
            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size
            """
            ########

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            ########
            speaker_labels = speaker_labels.cuda(non_blocking=True).long()  # was `async=True`; redundant here, already moved to the GPU above
            # Print the model output as a per-frame class-score sequence for each utterance...
            torch.set_printoptions(profile="full")
            ########print("OUT: " + str(out.size()), "NEW OUT:" + str(new_out.size()), "SPEAKER LABELS:" + str(speaker_labels.size()), "INPUT PERCENTAGES MEAN: " + str(input_percentages.mean()))
            #print(out[:,:,0])
            #print("SPEAKER LABELS: " + str(speaker_labels))
            #print(out[0][0])
            #softmax_output = F.softmax(out).data # This does NOT do what I want...
            #softmax_output_alt = flex_softmax(out, axis=2).data # This is FINE!!! <<<===
            #print(softmax_output[0][0])
            #print(softmax_output_alt[0][0])
            ########

            ########
            #if args.stride == 1: multiplier = 6
            #if args.stride == 2: multiplier = 3
            #if args.stride == 3: multiplier = 2
            #if args.stride == 4: multiplier = 1 #(Should be 1.5...)
            #if args.stride == 5: multiplier = 1 #(Should be 1.25...)

            class_accu.add(out[round(out.size(0)/2)].data, speaker_labels.data)
            class_accu_sum.add(torch.sum(out, 0).data, speaker_labels.data)

            class_accu_sum_120.add(torch.sum(out[1*multiplier:-1*multiplier], 0).data, speaker_labels.data)
            class_accu_sum_240.add(torch.sum(out[2*multiplier:-2*multiplier], 0).data, speaker_labels.data)
            class_accu_sum_360.add(torch.sum(out[3*multiplier:-3*multiplier], 0).data, speaker_labels.data)
            class_accu_sum_480.add(torch.sum(out[4*multiplier:-4*multiplier], 0).data, speaker_labels.data)
            #class_accu_sum_120.add(torch.sum(out[round(out.size(0)/2)-1*multiplier:round(out.size(0)/2)+1*multiplier], 0).data, speaker_labels.data)
            #class_accu_sum_240.add(torch.sum(out[round(out.size(0)/2)-2*multiplier:round(out.size(0)/2)+2*multiplier], 0).data, speaker_labels.data)
            #class_accu_sum_360.add(torch.sum(out[round(out.size(0)/2)-3*multiplier:round(out.size(0)/2)+3*multiplier], 0).data, speaker_labels.data)
            #class_accu_sum_480.add(torch.sum(out[round(out.size(0)/2)-4*multiplier:round(out.size(0)/2)+4*multiplier], 0).data, speaker_labels.data)

            #accu_out3 = torch.sum(flex_softmax(out[20:], axis=2), 0)
            #print(classaccu.value()[0], classaccu.value()[1])
            # Cross Entropy Loss for a Sequence (Time Series) of Output?
            #output = output.view(-1,29)
            #target = target.view(-1)
            #criterion = nn.CrossEntropyLoss()
            #loss = criterion(output,target)

            print('Validation Summary Epoch: [{0}]\t'
                  'CAR {car:.3f}\t'
                  'CAR_SUM {car_sum:.3f}\t'
                  'CAR_SUM_120 {car_sum_120:.3f}\t'
                  'CAR_SUM_240 {car_sum_240:.3f}\t'
                  'CAR_SUM_360 {car_sum_360:.3f}\t'
                  'CAR_SUM_480 {car_sum_480:.3f}\t'
                  .format(epoch + 1, car=class_accu.value()[0], car_sum=class_accu_sum.value()[0],
                          car_sum_240=class_accu_sum_240.value()[0], car_sum_120=class_accu_sum_120.value()[0],
                          car_sum_360=class_accu_sum_360.value()[0], car_sum_480=class_accu_sum_480.value()[0]
                          )
                  )
            """
            seq_length = out.size(0)
            sizes = input_percentages.mul_(int(seq_length)).int()            
            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x], target_strings[x]) / float(len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x], target_strings[x]) / float(len(target_strings[x]))
            total_cer += cer
            total_wer += wer
            """
            ########

            if args.cuda:
                torch.cuda.synchronize()
            del out

        ########
        """
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(
            epoch + 1, wer=wer, cer=cer))
        """
        ########

        ########
        print("\nCURRENT EPOCH TEST RESULTS:\t", class_accu.value()[0], "\t", class_accu_sum.value()[0],
              "\t", class_accu_sum_120.value()[0], "\t", class_accu_sum_240.value()[0],
              "\t", class_accu_sum_360.value()[0], "\t", class_accu_sum_480.value()[0], "\n")

        if (best_test_accu < class_accu.value()[0]): best_test_accu = class_accu.value()[0]
        if (best_test_accu_sum < class_accu_sum.value()[0]): best_test_accu_sum = class_accu_sum.value()[0]
        if (best_test_accu_sum_120 < class_accu_sum_120.value()[0]): best_test_accu_sum_120 = class_accu_sum_120.value()[0]
        if (best_test_accu_sum_240 < class_accu_sum_240.value()[0]): best_test_accu_sum_240 = class_accu_sum_240.value()[0]
        if (best_test_accu_sum_360 < class_accu_sum_360.value()[0]): best_test_accu_sum_360 = class_accu_sum_360.value()[0]
        if (best_test_accu_sum_480 < class_accu_sum_480.value()[0]): best_test_accu_sum_480 = class_accu_sum_480.value()[0]

        print("\nBEST EPOCH TRAINING RESULTS:\t", best_train_accu, "\t", best_train_accu_sum,
              "\t", best_train_accu_sum_120, "\t", best_train_accu_sum_240,
              "\t", best_train_accu_sum_360, "\t", best_train_accu_sum_480)
        print("\nBEST EPOCH TEST RESULTS:\t", best_test_accu, "\t", best_test_accu_sum,
              "\t", best_test_accu_sum_120, "\t", best_test_accu_sum_240,
              "\t", best_test_accu_sum_360, "\t", best_test_accu_sum_480)
        print("\nEPOCHS 70%, 90%, 95%, 99%:\t", epoch_70, "\t", epoch_90, "\t", epoch_95, "\t", epoch_99)
        print("\nBEST AVERAGE LOSS:\t", best_avg_loss, "\n")
        ########

        ########
        """
        if args.visdom:
            # epoch += 1
            x_axis = epochs[0:epoch + 1]
            y_axis = [loss_results[0:epoch + 1], wer_results[0:epoch + 1], cer_results[0:epoch + 1]]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.tensorboard:
            info = {
                'Avg Train Loss': avg_loss,
                'Avg WER': wer,
                'Avg CER': cer
            }
            for tag, val in info.items():
                logger.scalar_summary(tag, val, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, to_np(value), epoch + 1)
                    logger.histo_summary(tag + '/grad', to_np(value.grad), epoch + 1)
        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results),
                       file_path)

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results)
                       , args.model_path)
            best_wer = wer
        """
        ########

        avg_loss = 0
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf, manifest_filepath=args.train_manifest,
                                                         labels=labels,
                                                         normalize=True, augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler
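########
# Added illustration (not part of the original example): a minimal, hedged
# sketch of how the middle-frame ("CAR") and summed-logit ("CAR_SUM") speaker
# decisions above relate, using dummy tensors; `T`, `N`, `C` are assumptions.
def _demo_car_vs_car_sum():
    import torch
    T, N, C = 50, 4, 48                 # time steps, batch size, speaker classes
    out = torch.randn(T, N, C)          # stand-in for the model output (TxNxC)
    mid_pred = out[T // 2].max(dim=1)[1]        # middle-frame decision (CAR)
    sum_pred = torch.sum(out, 0).max(dim=1)[1]  # summed-logit decision (CAR_SUM)
    return mid_pred, sum_pred
########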
Exemplo n.º 18
0
audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size)

model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   audio_conf=audio_conf,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type],
                   mixed_precision=args.mixed_precision)
model = model.to(device)
if args.mixed_precision:
    model = convert_model_to_half(model)
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True, weight_decay=1e-5)
if args.distributed:
    model = DistributedDataParallel(model)
if args.mixed_precision:
    optimizer = FP16_Optimizer(optimizer,
                               static_loss_scale=args.static_loss_scale,
                               dynamic_loss_scale=args.dynamic_loss_scale)

criterion = CTCLoss()
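# NOTE (added, hedged): assuming warp-ctc's CTCLoss (common in
# deepspeech.pytorch forks of this vintage), the forward pass takes raw
# activations shaped TxNxC plus flat labels and their sizes; torch.nn.CTCLoss
# would expect log-probabilities instead.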

seconds = int(args.seconds)
batch_size = int(args.batch_size)


def iteration(inputs):
Exemplo n.º 19
0
def main(args):

    # Set seeds for determinism
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
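    # NOTE (added): seeding torch, CUDA, NumPy and `random` covers the data
    # sampling and augmentation here; fully deterministic cuDNN kernels would
    # also need torch.backends.cudnn.deterministic = True.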

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    labels = LABELS

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=args.rnn_hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=args.bidirectional)

    # Data setup
    evaluation_decoder = GreedyDecoder(
        model.labels)  # Decoder used for validation

    train_df = pd.read_csv(args.train_path)
    train_dataset = SpeechDataset(args=args, df=train_df)

    test_df = pd.read_csv(args.test_path)
    test_dataset = SpeechDataset(args=args, df=test_df)

    train_loader = AudioDataLoader(dataset=train_dataset,
                                   num_workers=args.num_workers,
                                   batch_size=args.batch_size)

    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=args.num_workers,
                                  batch_size=args.batch_size)

    model = model.to(args.device)
    parameters = model.parameters()

    optimizer = torch.optim.AdamW(parameters,
                                  lr=args.learning_rate,
                                  betas=args.betas,
                                  eps=args.eps,
                                  weight_decay=args.weight_decay)

    criterion = CTCLoss()

    best_score = 99999

    for epoch in range(args.epochs):
        train_loss = train_fn(args, train_loader, model, optimizer, criterion)
        wer, cer, output_data = test_fn(args=args,
                                        model=model,
                                        decoder=evaluation_decoder,
                                        target_decoder=evaluation_decoder)

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if (wer + cer) / 2 < best_score:
            print("**** Model Improved !!!! Saving Model")
            torch.save(model.state_dict(), f"best_model.bin")
            best_score = (wer + cer) / 2
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad:
        print("Shuffling batches for the following epochs")
        train_sampler.shuffle(start_epoch)

    try:
        model.load_state_dict(torch.load(args.weights)['state_dict'],
                              strict=False)
        print('using weights')
    except Exception:
        print('not using weights')
    model = model.to(device)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True,
                                weight_decay=1e-5)
    if optim_state is not None:
        optimizer.load_state_dict(optim_state)

    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=args.opt_level,
        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        loss_scale=args.loss_scale)
    if args.distributed:
Exemplo n.º 21
0
def main():
    args = parser.parse_args()

    params.cuda = not bool(args.cpu)
    print("Use cuda: {}".format(params.cuda))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print(
            "ERROR: GRU does not currently support activations other than tanh"
        )
        sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(
        params.epochs), torch.Tensor(params.epochs), torch.Tensor(
            params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    if args.use_set == 'libri':
        testing_manifest = params.val_manifest + ("_held" if args.hold_idx >= 0
                                                  else "")
    else:
        testing_manifest = params.test_manifest

    if args.batch_size_val > 0:
        params.batch_size_val = args.batch_size_val

    print("Testing on: {}".format(testing_manifest))
    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.val_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=testing_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=params.batch_size_val,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=params.lr,
                                momentum=params.momentum,
                                nesterov=True,
                                weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get(
            'epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))

        if args.start_epoch != -1:
            start_epoch = args.start_epoch

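        # NOTE (added): the bookkeeping restored above is immediately reset
        # below, so this test script always evaluates from a clean epoch counter.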
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
        avg_training_loss = 0
        epoch = 1
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
        avg_training_loss = 0
    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()
        # model         = torch.nn.parallel.DistributedDataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    for epoch in range(start_epoch, params.epochs):

        #################################################################################################################
        #                    The test script only really cares about this section.
        #################################################################################################################
        model.eval()

        wer, cer, trials = eval_model_verbose(model, test_loader, decoder,
                                              params.cuda, args.n_trials)
        root = os.getcwd()
        outfile = osp.join(
            root,
            "inference_bs{}_i{}_gpu{}.csv".format(params.batch_size_val,
                                                  args.hold_idx, params.cuda))
        print("Exporting inference to: {}".format(outfile))
        make_file(outfile)
        write_line(
            outfile, "batch times pre normalized by hold_sec =,{}\n".format(
                args.hold_sec))
        write_line(outfile, "wer, {}\n".format(wer))
        write_line(outfile, "cer, {}\n".format(cer))
        write_line(outfile, "bs, {}\n".format(params.batch_size_val))
        write_line(outfile, "hold_idx, {}\n".format(args.hold_idx))
        write_line(outfile, "cuda, {}\n".format(params.cuda))
        write_line(outfile,
                   "avg batch time, {}\n".format(trials.avg / args.hold_sec))
        percentile_50 = np.percentile(
            trials.array, 50) / params.batch_size_val / args.hold_sec
        write_line(outfile, "50%-tile latency, {}\n".format(percentile_50))
        percentile_99 = np.percentile(
            trials.array, 99) / params.batch_size_val / args.hold_sec
        write_line(outfile, "99%-tile latency, {}\n".format(percentile_99))
        write_line(outfile, "through put, {}\n".format(1 / percentile_50))
        write_line(outfile, "data\n")
        for trial in trials.array:
            write_line(outfile, "{}\n".format(trial / args.hold_sec))

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0][
            'lr'] / params.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")