Пример #1
0
    def decode(self,
               y,
               pred,
               batch_no,
               batch_size,
               debuglines=1,
               logfile='decode_dev.txt'):
        """Decode a validation batch and log WER/CER for the first few lines.

        Args:
            y: ground-truth label output, decoded via self.index_map.
            pred: model prediction output, decoded via self.index_map.
            batch_no: batch index, used only in the log line.
            batch_size: unused here; kept for interface compatibility.
            debuglines: maximum number of utterances to decode and log.
            logfile: log file name passed through to self.log.

        Returns:
            Tuple (word errors, word count, char errors, char count)
            accumulated over the logged utterances.
        """
        ys = output_to_sequence(y, self.index_map)
        preds = output_to_sequence(pred, self.index_map)

        reallines = min(debuglines, len(ys))

        twordn = tworde = 0
        tcharn = tchare = 0
        self.log('validation batch:{},'.format(batch_no), logfile=logfile)
        for i in range(reallines):
            # NOTE(review): .encode() yields bytes on Python 3; the str '+'
            # concatenation below implies this code targets Python 2.
            ground_truth = ys[i].encode('utf-8')
            predstr = preds[i].encode('utf-8')
            we, wn = wer(ground_truth.split(), predstr.split(), False)
            ce, cn = cer(ground_truth, predstr, False)
            twordn += wn
            tworde += we
            tcharn += cn
            tchare += ce
            # Guard against empty references: avoid ZeroDivisionError when
            # the ground truth has no words/characters.
            line_wer = we * 1.0 / wn if wn else 0.0
            line_cer = ce * 1.0 / cn if cn else 0.0
            self.log('Truth=>:' + ground_truth, logfile=logfile)
            self.log('ASR===>:' + predstr +
                     ',wer:{},cer:{}'.format(line_wer, line_cer),
                     logfile=logfile)
        return tworde, twordn, tchare, tcharn
Пример #2
0
def main():
    """Print the average word error rate over the file given in sys.argv[1].

    get_list() populates the module-level exp_list / actual_list; this then
    averages the per-utterance WER between the parallel lists.
    """
    get_list(sys.argv[1])
    # The two lists must stay parallel; fail loudly on a mismatch instead
    # of raising an opaque IndexError mid-loop.
    if len(exp_list) != len(actual_list):
        raise ValueError('expected and actual transcript counts differ: '
                         '{} vs {}'.format(len(exp_list), len(actual_list)))
    # Guard against an empty input file to avoid ZeroDivisionError.
    if not exp_list:
        print("AVG:\t", 0.0)
        return
    total = sum(wer(expected.split(), actual.split())
                for expected, actual in zip(exp_list, actual_list))
    print("AVG:\t", total / len(exp_list))
Пример #3
0
    def decode(self,
               y,
               pred,
               batch_no,
               batch_size,
               debuglines=1,
               logfile='decode_dev.txt',
               label_seq_length=None):
        """Decode a batch and log WER/CER for the first `debuglines` items.

        Args:
            y: ground-truth labels; decoded densely for the 'las' model
               (which needs explicit label lengths), sparsely otherwise.
            pred: model predictions, decoded via self.index_map.
            batch_no: batch index, used only in the log line.
            batch_size: unused here; kept for interface compatibility.
            debuglines: maximum number of utterances to decode and log.
            logfile: log file name passed through to self.log.
            label_seq_length: per-utterance label lengths ('las' model only).

        Returns:
            Tuple (word errors, word count, char errors, char count)
            accumulated over the logged utterances.
        """
        if self.args.model == 'las':
            # The LAS path produces dense label matrices, so the true label
            # lengths are needed to strip padding.
            ys = output_to_sequence_dense(y,
                                          self.index_map,
                                          label_length=label_seq_length)
        else:
            ys = output_to_sequence(y, self.index_map)
        preds = output_to_sequence(pred, self.index_map)

        reallines = min(debuglines, len(ys))

        twordn = tworde = 0
        tcharn = tchare = 0
        self.log('decode batch:{},'.format(batch_no), logfile=logfile)
        for i in range(reallines):
            # NOTE(review): .encode() yields bytes on Python 3; the str '+'
            # concatenation below implies this code targets Python 2.
            ground_truth = ys[i].encode('utf-8')
            predstr = preds[i].encode('utf-8')
            we, wn = wer(ground_truth.split(), predstr.split(), False)
            ce, cn = cer(ground_truth, predstr, False)
            twordn += wn
            tworde += we
            tcharn += cn
            tchare += ce
            # Guard against empty references: avoid ZeroDivisionError when
            # the ground truth has no words/characters.
            line_wer = we * 1.0 / wn if wn else 0.0
            line_cer = ce * 1.0 / cn if cn else 0.0
            self.log('Truth=>:' + ground_truth, logfile=logfile)
            self.log('ASR===>:' + predstr +
                     ',wer:{},cer:{}'.format(line_wer, line_cer),
                     logfile=logfile)
        return tworde, twordn, tchare, tcharn
def main(learning_rate=5e-4,
         batch_size=5,
         epochs=1,
         experiment=Experiment(api_key='dummy_key', disabled=True)):
    """Train a CTC speech-recognition model and plot loss/WER/LR curves.

    Builds train/val data loaders, trains for `epochs` epochs with AdamW +
    OneCycleLR, logs metrics to the comet_ml `experiment`, saves a checkpoint
    per epoch under ./logs/, and writes loss/WER/LR plots under ./images/.

    Args:
        learning_rate: peak learning rate for AdamW and OneCycleLR.
        batch_size: mini-batch size for both loaders.
        epochs: number of training epochs.
        experiment: comet_ml Experiment for metric logging. NOTE(review):
            this default object is constructed once at import time (mutable
            default argument); harmless here since it is a disabled dummy.
    """
    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 7,
        "rnn_dim": 1024,
        "n_class": 29,
        "n_feats": 128,
        "stride": 2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }
    experiment.log_parameters(hparams)
    use_cuda = torch.cuda.is_available()
    # Fixed seed for reproducibility.
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    train_dataset = Aduio_DataLoader(
        data_folder=r'D:\dataset\ntut-ml-2020-taiwanese-e2e\train',
        utterance_csv=
        r'D:\dataset\ntut-ml-2020-taiwanese-e2e\train-toneless_update.csv',
        sr=16000,
        dimension=480000)
    # NOTE(review): the validation dataset reads the *train* transcript CSV
    # (train-toneless_update.csv) — confirm this is intentional.
    val_dataset = Aduio_DataLoader(
        data_folder=r'D:\dataset\ntut-ml-2020-taiwanese-e2e\val',
        utterance_csv=
        r'D:\dataset\ntut-ml-2020-taiwanese-e2e\train-toneless_update.csv',
        sr=16000,
        dimension=480000)

    train_loader = data.DataLoader(
        dataset=train_dataset,
        batch_size=hparams['batch_size'],
        shuffle=True,
        collate_fn=lambda x: data_processing(x, 'train'),
        num_workers=0,
        pin_memory=True)

    val_loader = data.DataLoader(
        dataset=val_dataset,
        batch_size=hparams['batch_size'],
        shuffle=False,
        collate_fn=lambda x: data_processing(x, 'val'),
        num_workers=0,
        pin_memory=True)
    model = SpeechRecognitionModel(hparams['n_cnn_layers'],
                                   hparams['n_rnn_layers'], hparams['rnn_dim'],
                                   hparams['n_class'], hparams['n_feats'],
                                   hparams['stride'],
                                   hparams['dropout']).to(device)
    # print(model)
    print('Num Model Parameters',
          sum([param.nelement() for param in model.parameters()]))
    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    # blank=28: index of the CTC blank symbol (n_class is 29).
    criterion = nn.CTCLoss(blank=28).to(device)
    # scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
    #                                            T_max=hparams['epochs'],
    #                                            eta_min=1e-6,
    #                                            last_epoch=-1)
    # OneCycleLR is stepped once per batch (see scheduler.step() below).
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              max_lr=hparams['learning_rate'],
                                              steps_per_epoch=int(
                                                  len(train_loader)),
                                              epochs=hparams['epochs'],
                                              anneal_strategy='cos')

    # NOTE(review): GradScaler is created but never used below — mixed
    # precision (autocast / scaler.scale) is not actually applied.
    scaler = torch.cuda.amp.GradScaler()
    train_data_len = len(train_loader.dataset)
    train_epoch_size = math.ceil(train_data_len / batch_size)
    val_data_len = len(val_loader.dataset)
    val_epoch_size = math.ceil(val_data_len / batch_size)
    iter_meter = IterMeter()
    train_losses, val_losses, wers, lrs = [], [], [], []
    for epoch in range(1, epochs + 1):
        print('running epoch: {} / {}'.format(epoch, epochs))
        start_time = time.time()
        # Training mode
        model.train()
        train_loss = 0
        with tqdm(total=train_epoch_size,
                  desc='train',
                  postfix=dict,
                  mininterval=0.3) as pbar:
            with experiment.train():
                for batch_idx, _data in enumerate(train_loader):
                    # Fifth element of the batch tuple is ignored here.
                    spectrograms, labels, input_lengths, label_lengths, _ = _data
                    # print(input_lengths)
                    spectrograms, labels = spectrograms.to(device), labels.to(
                        device)
                    optimizer.zero_grad()
                    output = model(spectrograms)
                    output = F.log_softmax(output, dim=2)
                    # CTCLoss expects (time, batch, n_class).
                    output = output.transpose(0, 1)
                    loss = criterion(output, labels, input_lengths,
                                     label_lengths)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()
                    experiment.log_metric('loss',
                                          loss.item(),
                                          step=iter_meter.get())
                    experiment.log_metric('learning_rate',
                                          scheduler.get_last_lr(),
                                          step=iter_meter.get())
                    iter_meter.step()
                    waste_time = time.time() - start_time
                    # Weight by batch size; divided by dataset size after
                    # the epoch to get the mean per-sample loss.
                    train_loss += loss.item() * spectrograms.size(0)
                    pbar.set_postfix(
                        **{
                            'loss': loss.item(),
                            'lr': round(scheduler.get_last_lr()[0], 6),
                            'step/s': waste_time
                        })
                    pbar.update(1)
                    start_time = time.time()
                    lrs.append(scheduler.get_last_lr()[0])

        start_time = time.time()
        # Evaluation mode
        model.eval()
        val_loss = 0
        val_cer, val_wer = [], []
        with tqdm(total=val_epoch_size,
                  desc='val',
                  postfix=dict,
                  mininterval=0.3) as pbar:
            with experiment.test():
                with torch.no_grad():
                    for I, _data in enumerate(val_loader):
                        spectrograms, labels, input_lengths, label_lengths, _ = _data
                        spectrograms, labels = spectrograms.to(
                            device), labels.to(device)
                        output = model(spectrograms)  # (batch, time, n_class)
                        output = F.log_softmax(output, dim=2)
                        output = output.transpose(0,
                                                  1)  # (time, batch, n_class)
                        loss = criterion(output, labels, input_lengths,
                                         label_lengths)
                        val_loss += loss.item() * spectrograms.size(0)
                        decoded_preds, decoded_targets = GreedyDecoder(
                            output.transpose(0, 1), labels, label_lengths)
                        for j in range(len(decoded_preds)):
                            # val_cer.append(cer(decoded_targets[j], decoded_preds[j]))
                            val_wer.append(
                                wer(reference=decoded_targets[j],
                                    hypothesis=decoded_preds[j]))
                        waste_time = time.time() - start_time
                        pbar.set_postfix(**{
                            'loss': loss.item(),
                            'step/s': waste_time
                        })
                        pbar.update(1)
                        start_time = time.time()
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # avg_cer = sum(val_cer) / len(val_cer)
        # NOTE(review): raises ZeroDivisionError if the val set is empty.
        avg_wer = sum(val_wer) / len(val_wer)
        wers.append(avg_wer)

        experiment.log_metric('val_loss', val_loss, step=iter_meter.get())
        # experiment.log_metric('cer', avg_cer, step=iter_meter.get())
        experiment.log_metric('wer', avg_wer, step=iter_meter.get())
        # print(
        #     'Val set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'
        #     .format(val_loss, avg_cer, avg_wer))
        print(
            'average train_loss: {:.4f}, average val_loss: {:.4f}, average wer: {:.4f}\n'
            .format(train_loss, val_loss, avg_wer))
        # Checkpoint every epoch; metrics are embedded in the file name.
        torch.save(
            model.state_dict(),
            './logs/epoch%d-train_loss%.4f-val_loss%.4f-avg_wer%.4f.pth' %
            (epoch, train_loss, val_loss, avg_wer))

    # Plot figures
    plt.figure()
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.legend(loc='best')
    plt.savefig('./images/loss.jpg')
    plt.show()

    plt.figure()
    plt.xlabel('Epochs')
    plt.ylabel('WER')
    plt.plot(wers, label='WER')
    plt.legend(loc='best')
    plt.savefig('./images/wer.jpg')
    plt.show()

    plt.figure()
    plt.xlabel('Mini-batch')
    plt.ylabel('Learning Rate')
    plt.plot(lrs, label='Learning Rate')
    plt.legend(loc='best')
    plt.savefig('./images/lr.jpg')
    plt.show()
Пример #5
0
def train(model, device, train_loader, test_loader, criterion, optimizer,
          scheduler, epochs, epoch, train_epoch_size, val_epoch_size,
          iter_meter, experiment):
    """Run one training epoch, evaluate on the test set, and checkpoint.

    Args:
        model: network to train and evaluate.
        device: torch device tensors are moved to.
        train_loader, test_loader: loaders yielding
            (spectrograms, labels, input_lengths, label_lengths).
        criterion: CTC loss module.
        optimizer: optimizer stepped once per batch.
        scheduler: per-batch LR scheduler, stepped after each optimizer step.
        epochs: total epoch count (for the progress printout only).
        epoch: current 1-based epoch index (printout and checkpoint name).
        train_epoch_size, val_epoch_size: batch counts for the tqdm bars.
        iter_meter: global step counter used for metric logging.
        experiment: comet_ml Experiment for metric logging.
    """
    print('running epoch: {} / {}'.format(epoch, epochs))
    start_time = time.time()
    # Training mode
    model.train()
    with tqdm(total=train_epoch_size,
              desc='train',
              postfix=dict,
              mininterval=0.3) as pbar:
        with experiment.train():
            for batch_idx, _data in enumerate(train_loader):
                spectrograms, labels, input_lengths, label_lengths = _data
                spectrograms, labels = spectrograms.to(device), labels.to(
                    device)
                optimizer.zero_grad()
                output = model(spectrograms)  # (batch, time, n_class)
                output = F.log_softmax(output, dim=2)
                # CTCLoss expects (time, batch, n_class).
                output = output.transpose(0, 1)
                loss = criterion(output, labels, input_lengths, label_lengths)

                loss.backward()
                experiment.log_metric('loss',
                                      loss.item(),
                                      step=iter_meter.get())
                experiment.log_metric('learning_rate',
                                      scheduler.get_last_lr(),
                                      step=iter_meter.get())
                optimizer.step()
                scheduler.step()
                iter_meter.step()

                waste_time = time.time() - start_time
                pbar.set_postfix(
                    **{
                        'total_loss': loss.item(),
                        'lr': round(scheduler.get_last_lr()[0], 5),
                        'step/s': waste_time
                    })
                pbar.update(1)
                start_time = time.time()

    start_time = time.time()
    # Evaluation mode
    model.eval()
    test_loss = 0
    test_wer = []
    with tqdm(total=val_epoch_size, desc='val', postfix=dict,
              mininterval=0.3) as pbar:
        with experiment.test():
            with torch.no_grad():
                for I, _data in enumerate(test_loader):
                    spectrograms, labels, input_lengths, label_lengths = _data
                    spectrograms, labels = spectrograms.to(device), labels.to(
                        device)
                    output = model(spectrograms)  # (batch, time, n_class)
                    output = F.log_softmax(output, dim=2)
                    output = output.transpose(0, 1)  # (time, batch, n_class)
                    loss = criterion(output, labels, input_lengths,
                                     label_lengths)
                    # Running mean of the per-batch losses.
                    test_loss += loss.item() / len(test_loader)
                    decoded_preds, decoded_targets = GreedyDecoder(
                        output.transpose(0, 1), labels, label_lengths)
                    for j in range(len(decoded_preds)):
                        test_wer.append(
                            wer(reference=decoded_targets[j],
                                hypothesis=decoded_preds[j]))
                    waste_time = time.time() - start_time
                    pbar.set_postfix(**{
                        'total_loss': loss.item(),
                        'step/s': waste_time
                    })
                    pbar.update(1)
                    start_time = time.time()
    # Guard against an empty test set to avoid ZeroDivisionError.
    avg_wer = sum(test_wer) / len(test_wer) if test_wer else 0.0
    experiment.log_metric('test_loss', test_loss, step=iter_meter.get())
    experiment.log_metric('wer', avg_wer, step=iter_meter.get())
    print('Test set: Average loss: {:.4f}, Average WER: {:.4f}\n'.format(
        test_loss, avg_wer))
    # Bug fix: the original referenced an undefined `total_loss` here, which
    # raised a NameError at checkpoint time. The file name records the
    # average validation loss (test_loss), matching the 'val_loss' label.
    torch.save(
        model.state_dict(), './logs/epoch%d-val_loss%.4f-avg_wer%.4f.pth' %
        (epoch, test_loss, avg_wer))