Example #1
def run_transcribe(audio_path: str, spect_parser: SpectrogramParser,
                   model: DeepSpeech, decoder: Decoder, device: torch.device,
                   use_half: bool):
    # audio_path
    # try:
    #     # inTranscript = audio_path.replace("wav", "txt")
    #     # print(inTranscript)
    #     # getTranscript(inTranscript)
    #     pass
    # except Exception as asd:
    #     print(asd)
    #     pass
    spect = spect_parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    if use_half:
        spect = spect.half()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)

    # Also decode with a greedy decoder for comparison
    decoder2 = GreedyDecoder(labels=model.labels,
                             blank_index=model.labels.index('_'))
    decoded_output2, decoded_offsets2 = decoder2.decode(out, output_sizes)

    return decoded_output, decoded_output2, decoded_offsets, decoded_offsets2
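A minimal usage sketch for run_transcribe (not part of the original snippet); the import paths, the load_model/load_decoder signatures, and the file paths are assumptions modeled on the later examples.

# Hedged sketch: helper signatures and paths are assumptions, not taken from the snippet above.
import torch

from deepspeech_pytorch.configs.inference_config import LMConfig
from deepspeech_pytorch.loader.data_loader import SpectrogramParser
from deepspeech_pytorch.utils import load_decoder, load_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model(device=device, model_path="model.pth", use_half=False)  # assumed signature
decoder = load_decoder(labels=model.labels, cfg=LMConfig())
spect_parser = SpectrogramParser(model.audio_conf, normalize=True)

decoded, decoded_greedy, _, _ = run_transcribe(audio_path="sample.wav",
                                               spect_parser=spect_parser,
                                               model=model,
                                               decoder=decoder,
                                               device=device,
                                               use_half=False)
print(decoded[0][0])         # best hypothesis from the configured decoder
print(decoded_greedy[0][0])  # best hypothesis from the extra greedy decoder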
Example #2
    def __init__(self, labels: List,
                 model_cfg: Union[UniDirectionalConfig, BiDirectionalConfig,
                                  ConvolutionConfig], precision: int,
                 optim_cfg: Union[AdamConfig,
                                  SGDConfig], spect_cfg: SpectConfig):
        super().__init__()
        self.save_hyperparameters()
        self.model_cfg = model_cfg
        self.precision = precision
        self.optim_cfg = optim_cfg
        self.spect_cfg = spect_cfg
        self.convolutional = OmegaConf.get_type(
            model_cfg) is ConvolutionConfig
        self.bidirectional = OmegaConf.get_type(
            model_cfg) is BiDirectionalConfig

        self.labels = labels

        self.conv = MaskConv(
            nn.Sequential(
                nn.Conv2d(1,
                          32,
                          kernel_size=(41, 11),
                          stride=(2, 2),
                          padding=(20, 5)), nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True),
                nn.Conv2d(32,
                          32,
                          kernel_size=(21, 11),
                          stride=(2, 1),
                          padding=(10, 5)), nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True)))
        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
        rnn_input_size = int(
            math.floor((self.spect_cfg.sample_rate *
                        self.spect_cfg.window_size) / 2) + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
        rnn_input_size *= 32
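        # Worked example of the three lines above, assuming the common 16 kHz
        # sample rate and a 20 ms window (values not fixed by this snippet):
        #   frequency bins: 16000 * 0.02 / 2 + 1 = 161
        #   after conv1 (kernel 41, stride 2, pad 20): (161 + 40 - 41) / 2 + 1 = 81
        #   after conv2 (kernel 21, stride 2, pad 10): (81 + 20 - 21) / 2 + 1 = 41
        #   flattened over 32 channels: 41 * 32 = 1312 RNN input features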

        if self.convolutional is False:
            self.rnns, self.lookahead, self.fc = self._rnn_construct(
                rnn_input_size)

        else:
            self.deep_conv, self.fc = self._conv_construct(rnn_input_size)

        self.inference_softmax = InferenceBatchSoftmax()
        self.criterion = CTCLoss(blank=self.labels.index('_'),
                                 reduction='sum',
                                 zero_infinity=True)
        self.evaluation_decoder = GreedyDecoder(
            self.labels)  # Decoder used for validation
        self.wer = WordErrorRate(decoder=self.evaluation_decoder,
                                 target_decoder=self.evaluation_decoder)
        self.cer = CharErrorRate(decoder=self.evaluation_decoder,
                                 target_decoder=self.evaluation_decoder)
Example #3
def evaluate(cfg: EvalConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device, model_path=cfg.model.model_path)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    target_decoder = GreedyDecoder(labels=model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=model.spect_cfg,
                                      input_path=hydra.utils.to_absolute_path(
                                          cfg.test_path),
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)
    wer, cer = run_evaluation_print(test_loader=test_loader,
                                    device=device,
                                    model=model,
                                    decoder=decoder,
                                    target_decoder=target_decoder,
                                    precision=cfg.model.precision)

    print('Test Summary \t'
          'Average WER {wer:.3f}\t'
          'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
Example #4
    def __init__(self, cfg):
        self.cfg = cfg

        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.model = load_model(
            self.device, hydra.utils.to_absolute_path(self.cfg.model_path))
        self.ckpt = torch.load(hydra.utils.to_absolute_path(
            self.cfg.model_path),
                               map_location=self.device)
        self.labels = self.ckpt['hyper_parameters']['labels']

        self.decoder = BeamCTCDecoder(labels=self.labels,
                                      lm_path=hydra.utils.to_absolute_path(
                                          self.cfg.lm_path),
                                      beam_width=self.cfg.beam_width,
                                      num_processes=self.cfg.num_workers,
                                      blank_index=self.labels.index('_'))
        self.target_decoder = GreedyDecoder(labels=self.labels,
                                            blank_index=self.labels.index('_'))

        test_dataset = SpectrogramDataset(
            audio_conf=self.cfg.spect_cfg,
            input_path=hydra.utils.to_absolute_path(cfg.test_path),
            labels=self.labels,
            normalize=True)
        self.test_loader = AudioDataLoader(test_dataset,
                                           batch_size=self.cfg.batch_size,
                                           num_workers=self.cfg.num_workers)
Example #5
def evaluate(cfg: EvalConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")

    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)

    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=hydra.utils.to_absolute_path(cfg.test_manifest),
        labels=model.labels,
        normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)
    wer, cer, output_data = run_evaluation(test_loader=test_loader,
                                           device=device,
                                           model=model,
                                           decoder=decoder,
                                           target_decoder=target_decoder,
                                           save_output=cfg.save_output,
                                           verbose=cfg.verbose,
                                           use_half=cfg.model.use_half)

    print('Test Summary \t'
          'Average WER {wer:.3f}\t'
          'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
    if cfg.save_output:
        torch.save(output_data, hydra.utils.to_absolute_path(cfg.save_output))
Example #6
def load_decoder(decoder_type, labels, lm_path, alpha, beta, cutoff_top_n,
                 cutoff_prob, beam_width, lm_workers):
    if decoder_type == "beam":
        from deepspeech_pytorch.decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels=labels,
                                 lm_path=lm_path,
                                 alpha=alpha,
                                 beta=beta,
                                 cutoff_top_n=cutoff_top_n,
                                 cutoff_prob=cutoff_prob,
                                 beam_width=beam_width,
                                 num_processes=lm_workers)
    else:
        decoder = GreedyDecoder(labels=labels, blank_index=labels.index('_'))
    return decoder
Example #7
def load_decoder(labels, cfg: LMConfig):
    if cfg.decoder_type == DecoderType.beam:
        from deepspeech_pytorch.decoder import BeamCTCDecoder
        if cfg.lm_path:
            cfg.lm_path = hydra.utils.to_absolute_path(cfg.lm_path)
        decoder = BeamCTCDecoder(labels=labels,
                                 lm_path=cfg.lm_path,
                                 alpha=cfg.alpha,
                                 beta=cfg.beta,
                                 cutoff_top_n=cfg.cutoff_top_n,
                                 cutoff_prob=cfg.cutoff_prob,
                                 beam_width=cfg.beam_width,
                                 num_processes=cfg.lm_workers)
    else:
        decoder = GreedyDecoder(labels=labels, blank_index=labels.index('_'))
    return decoder
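A brief, hedged sketch of calling this config-driven loader; the import paths, the LMConfig/DecoderType fields, and the numeric values below are assumptions based on deepspeech.pytorch's inference config, not taken from the snippet above.

from deepspeech_pytorch.configs.inference_config import LMConfig
from deepspeech_pytorch.enums import DecoderType

# Greedy decoding (the LMConfig default):
decoder = load_decoder(labels=model.labels, cfg=LMConfig())

# Beam-search decoding with an external language model (placeholder path and tuning values):
beam_cfg = LMConfig(decoder_type=DecoderType.beam,
                    lm_path='lm.binary',
                    alpha=1.97,
                    beta=4.36,
                    beam_width=128)
beam_decoder = load_decoder(labels=model.labels, cfg=beam_cfg)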
Example #8
File: server.py  Project: mjurkus/deep_lt
def main():
    import argparse
    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(
        description='DeepSpeech transcription server')
    parser.add_argument('--host',
                        type=str,
                        default='0.0.0.0',
                        help='Host to be used by the server')
    parser.add_argument('--port',
                        type=int,
                        default=8888,
                        help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from deepspeech_pytorch.decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
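A small client-side sketch for the server above, assuming it registers the upstream deepspeech.pytorch POST /transcribe route that accepts a multipart file upload (the route itself is not shown in this snippet).

# Hedged sketch: the '/transcribe' endpoint and 'file' field name are assumptions.
import requests

with open('sample.wav', 'rb') as f:
    response = requests.post('http://localhost:8888/transcribe', files={'file': f})
print(response.json())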
Example #9
File: test.py  Project: mjurkus/deep_lt
    model = load_model(device, args.model_path)

    with open('labels.json') as label_file:
        labels = json.load(label_file)

    decoder = load_decoder(decoder_type=args.decoder,
                           labels=labels,
                           lm_path=args.lm_path,
                           alpha=args.alpha,
                           beta=args.beta,
                           cutoff_top_n=args.cutoff_top_n,
                           cutoff_prob=args.cutoff_prob,
                           beam_width=args.beam_width,
                           lm_workers=args.lm_workers)

    target_decoder = GreedyDecoder(labels)

    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=labels,
                                      normalize=True)

    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    wer, cer, output_data = evaluate(test_loader=test_loader,
                                     device=device,
                                     model=model,
                                     decoder=decoder,
                                     target_decoder=target_decoder,
Example #10
def train(cfg):
    config = dict(
        epochs=cfg.training.epochs,
        batch_size=cfg.data.batch_size,
        learning_rate=cfg.optim.learning_rate,
        rnn_type=cfg.model.rnn_type,
        hidden_size=cfg.model.hidden_size,
        hidden_layers=cfg.model.hidden_layers,
        optimizer=cfg.optim,
        #   update_hessian=cfg.optim.update_each
    )
    wandb.init(project="adahessian-deepspeech", config=config)

    # Set seeds for determinism
    torch.manual_seed(cfg.training.seed)
    torch.cuda.manual_seed_all(cfg.training.seed)
    np.random.seed(cfg.training.seed)
    random.seed(cfg.training.seed)
    torch.backends.cudnn.enabled = False  # cudnn.flags() is a context manager; set the flag directly

    main_proc = True
    device = torch.device("cpu" if cfg.training.no_cuda else "cuda")

    is_distributed = os.environ.get(
        "LOCAL_RANK")  # If local rank exists, distributed env

    if is_distributed:
        # When using NCCL, surviving nodes will deadlock on NCCL ops after a failure
        # because NCCL uses a spin-lock on the device. Set this env var to enable a
        # watchdog thread that will destroy stale NCCL communicators.
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

        device_id = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device_id)
        print(f"Setting CUDA Device to {device_id}")

        dist.init_process_group(backend=cfg.training.dist_backend.value)
        main_proc = device_id == 0  # Main process handles saving of models and reporting

    if OmegaConf.get_type(cfg.checkpointing) == FileCheckpointConfig:
        checkpoint_handler = FileCheckpointHandler(cfg=cfg.checkpointing)
    elif OmegaConf.get_type(cfg.checkpointing) == GCSCheckpointConfig:
        checkpoint_handler = GCSCheckpointHandler(cfg=cfg.checkpointing)
    else:
        raise ValueError("Checkpoint Config has not been specified correctly.")

    if main_proc and cfg.visualization.visdom:
        visdom_logger = VisdomLogger(id=cfg.visualization.id,
                                     num_epochs=cfg.training.epochs)
    if main_proc and cfg.visualization.tensorboard:
        tensorboard_logger = TensorBoardLogger(
            id=cfg.visualization.id,
            log_dir=to_absolute_path(cfg.visualization.log_dir),
            log_params=cfg.visualization.log_params)

    if cfg.checkpointing.load_auto_checkpoint:
        latest_checkpoint = checkpoint_handler.find_latest_checkpoint()
        if latest_checkpoint:
            cfg.checkpointing.continue_from = latest_checkpoint

    if cfg.checkpointing.continue_from:  # Starting from previous model
        state = TrainingState.load_state(
            state_path=to_absolute_path(cfg.checkpointing.continue_from))
        model = state.model
        if cfg.training.finetune:
            state.init_finetune_states(cfg.training.epochs)

        if main_proc and cfg.visualization.visdom:  # Add previous scores to visdom graph
            visdom_logger.load_previous_values(state.epoch, state.results)
        if main_proc and cfg.visualization.tensorboard:  # Previous scores to tensorboard logs
            tensorboard_logger.load_previous_values(state.epoch, state.results)
    else:
        # Initialise new model training
        with open(to_absolute_path(cfg.data.labels_path)) as label_file:
            labels = json.load(label_file)

        if OmegaConf.get_type(cfg.model) is BiDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=True)
        elif OmegaConf.get_type(cfg.model) is UniDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=False,
                context=cfg.model.lookahead_context)
        else:
            raise ValueError("Model Config has not been specified correctly.")

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=cfg.training.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(
        model.labels)  # Decoder used for validation
    train_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                       manifest_filepath=to_absolute_path(
                                           cfg.data.train_manifest),
                                       labels=model.labels,
                                       normalize=True,
                                       augmentation_conf=cfg.data.augmentation)
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=to_absolute_path(
                                          cfg.data.val_manifest),
                                      labels=model.labels,
                                      normalize=True)
    if not is_distributed:
        train_sampler = DSRandomSampler(dataset=train_dataset,
                                        batch_size=cfg.data.batch_size,
                                        start_index=state.training_step)
    else:
        train_sampler = DSElasticDistributedSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step)
    train_loader = AudioDataLoader(dataset=train_dataset,
                                   num_workers=cfg.data.num_workers,
                                   batch_sampler=train_sampler)
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size)

    model = model.to(device)
    parameters = model.parameters()
    if OmegaConf.get_type(cfg.optim) is SGDConfig:
        optimizer = torch.optim.SGD(parameters,
                                    lr=cfg.optim.learning_rate,
                                    momentum=cfg.optim.momentum,
                                    nesterov=True,
                                    weight_decay=cfg.optim.weight_decay)
    elif OmegaConf.get_type(cfg.optim) is AdamConfig:
        optimizer = torch.optim.AdamW(parameters,
                                      lr=cfg.optim.learning_rate,
                                      betas=cfg.optim.betas,
                                      eps=cfg.optim.eps,
                                      weight_decay=cfg.optim.weight_decay)
    elif OmegaConf.get_type(cfg.optim) is AdaHessianConfig:
        optimizer = AdaHessian(
            parameters,
            lr=cfg.optim.learning_rate,
            betas=cfg.optim.betas,
            eps=cfg.optim.eps,
            weight_decay=cfg.optim.weight_decay,
            update_each=cfg.optim.update_each,
            #    average_conv_kernel=cfg.optim.average_conv_kernel,
            # hessian_power=cfg.optim.hessian_power
        )
        torch.backends.cudnn.enabled = False

    else:
        raise ValueError("Optimizer has not been specified correctly.")
    if OmegaConf.get_type(cfg.optim) is not AdaHessianConfig:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          enabled=not cfg.training.no_cuda,
                                          opt_level=cfg.apex.opt_level,
                                          loss_scale=cfg.apex.loss_scale)
    if state.optim_state is not None:
        optimizer.load_state_dict(state.optim_state)
    if state.amp_state is not None:
        amp.load_state_dict(state.amp_state)

    # Track states for optimizer/amp
    state.track_optim_state(optimizer)
    if not cfg.training.no_cuda and OmegaConf.get_type(
            cfg.optim) is not AdaHessianConfig:
        state.track_amp_state(amp)

    if is_distributed:
        model = DistributedDataParallel(model, device_ids=[device_id])
    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    criterion = CTCLoss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(state.epoch, cfg.training.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        state.set_epoch(epoch=epoch)
        train_sampler.set_epoch(epoch=epoch)
        train_sampler.reset_training_step(training_step=state.training_step)
        for i, (data) in enumerate(train_loader, start=state.training_step):
            state.set_training_step(training_step=i)
            inputs, targets, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = inputs.to(device)

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            float_out = out.float()  # ensure float32 for loss
            loss = criterion(float_out, targets, output_sizes,
                             target_sizes).to(device)
            loss = loss / inputs.size(0)  # average the loss by minibatch
            loss_value = loss.item()

            # Check to ensure valid loss was calculated
            valid_loss, error = check_loss(loss, loss_value)
            if valid_loss:
                optimizer.zero_grad()

                # compute gradient
                if OmegaConf.get_type(cfg.optim) is AdaHessianConfig:
                    loss.backward(create_graph=True)
                else:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               cfg.optim.max_norm)
                optimizer.step()
            else:
                print(error)
                print('Skipping grad update')
                loss_value = 0

            state.avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))

            if main_proc and cfg.checkpointing.checkpoint_per_iteration:
                checkpoint_handler.save_iter_checkpoint_model(epoch=epoch,
                                                              i=i,
                                                              state=state)
            del loss, out, float_out

        state.avg_loss /= len(train_dataset)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1,
                                                 epoch_time=epoch_time,
                                                 loss=state.avg_loss))

        with torch.no_grad():
            wer, cer, output_data = run_evaluation(
                test_loader=test_loader,
                device=device,
                model=model,
                decoder=evaluation_decoder,
                target_decoder=evaluation_decoder)

        state.add_results(epoch=epoch,
                          loss_result=state.avg_loss,
                          wer_result=wer,
                          cer_result=cer)

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if main_proc and cfg.visualization.visdom:
            visdom_logger.update(epoch, state.result_state)
        if main_proc and cfg.visualization.tensorboard:
            tensorboard_logger.update(epoch, state.result_state,
                                      model.named_parameters())
        if main_proc and cfg.visualization.wandb:
            wandb.log({
                'epoch': epoch,
                'Average Loss': state.avg_loss,
                'Average WER': wer,
                'Average CER': cer
            })

        if main_proc and cfg.checkpointing.checkpoint:  # Save epoch checkpoint
            checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state)
        # anneal lr
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] / cfg.optim.learning_anneal
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))
        wandb.log({"lr": g['lr']})
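        # e.g. with an assumed cfg.optim.learning_anneal of 1.1, a learning rate of
        # 3.0e-4 is annealed to roughly 2.73e-4 at the end of this epoch.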

        if main_proc and (state.best_wer is None or state.best_wer > wer):
            checkpoint_handler.save_best_model(epoch=epoch, state=state)
            state.set_best_wer(wer)
            state.reset_avg_loss()
        state.reset_training_step()  # Reset training step for next epoch
Example #11
def train(cfg):
    # Set seeds for determinism
    torch.manual_seed(cfg.training.seed)
    torch.cuda.manual_seed_all(cfg.training.seed)
    np.random.seed(cfg.training.seed)
    random.seed(cfg.training.seed)

    main_proc = True
    device = torch.device("cpu" if cfg.training.no_cuda else "cuda")

    is_distributed = os.environ.get(
        "LOCAL_RANK")  # If local rank exists, distributed env

    if is_distributed:
        # When using NCCL, surviving nodes will deadlock on NCCL ops after a failure
        # because NCCL uses a spin-lock on the device. Set this env var to enable a
        # watchdog thread that will destroy stale NCCL communicators.
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

        device_id = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device_id)
        print(f"Setting CUDA Device to {device_id}")

        dist.init_process_group(backend=cfg.training.dist_backend.value)
        main_proc = device_id == 0  # Main process handles saving of models and reporting

    if OmegaConf.get_type(cfg.checkpointing) == FileCheckpointConfig:
        checkpoint_handler = FileCheckpointHandler(cfg=cfg.checkpointing)
    elif OmegaConf.get_type(cfg.checkpointing) == GCSCheckpointConfig:
        checkpoint_handler = GCSCheckpointHandler(cfg=cfg.checkpointing)
    else:
        raise ValueError("Checkpoint Config has not been specified correctly.")

    if main_proc and cfg.visualization.visdom:
        visdom_logger = VisdomLogger(id=cfg.visualization.id,
                                     num_epochs=cfg.training.epochs)
    if main_proc and cfg.visualization.tensorboard:
        tensorboard_logger = TensorBoardLogger(
            id=cfg.visualization.id,
            log_dir=to_absolute_path(cfg.visualization.log_dir),
            log_params=cfg.visualization.log_params)

    if cfg.checkpointing.load_auto_checkpoint:
        latest_checkpoint = checkpoint_handler.find_latest_checkpoint()
        if latest_checkpoint:
            cfg.checkpointing.continue_from = latest_checkpoint

    if cfg.checkpointing.continue_from:  # Starting from previous model
        state = TrainingState.load_state(
            state_path=to_absolute_path(cfg.checkpointing.continue_from))
        model = state.model
        if cfg.training.finetune:
            state.init_finetune_states(cfg.training.epochs)

        if main_proc and cfg.visualization.visdom:  # Add previous scores to visdom graph
            visdom_logger.load_previous_values(state.epoch, state.results)
        if main_proc and cfg.visualization.tensorboard:  # Previous scores to tensorboard logs
            tensorboard_logger.load_previous_values(state.epoch, state.results)
    else:
        # Initialise new model training
        with open(to_absolute_path(cfg.data.labels_path)) as label_file:
            labels = json.load(label_file)

# # Model configuration, as defined in train_config.py at line 51:
# @dataclass
# class BiDirectionalConfig:
#     rnn_type: RNNType = RNNType.lstm  # Type of RNN to use in model
#     hidden_size: int = 1024  # Hidden size of RNN Layer
#     hidden_layers: int = 7  # Number of RNN layers
        if OmegaConf.get_type(cfg.model) is BiDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=True)
        elif OmegaConf.get_type(cfg.model) is UniDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=False,
                context=cfg.model.lookahead_context)
        else:
            raise ValueError("Model Config has not been specified correctly.")

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=cfg.training.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(
        model.labels)  # Decoder used for validation
    train_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=to_absolute_path(cfg.data.train_manifest),
        labels=model.labels,
        normalize=True,
        augmentation_conf=cfg.data.augmentation
    )  # spect config; ids = [[line 1], [line 2], ...]; labels_ = dict
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=to_absolute_path(
                                          cfg.data.val_manifest),
                                      labels=model.labels,
                                      normalize=True)
    if not is_distributed:
        train_sampler = DSRandomSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step
        )  # DSRandomSampler picks an arbitrary minibatch and shuffles the samples within it
    else:
        train_sampler = DSElasticDistributedSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step)
    train_loader = AudioDataLoader(
        dataset=train_dataset,
        num_workers=cfg.data.num_workers,
        batch_sampler=train_sampler
    )  # AudioDataLoader's collate_fn processes each selected minibatch; in total it yields 835 batches (for FPT, VIVOS), each one holding batch_size samples
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size)

    model = model.to(device)
    parameters = model.parameters()
    if OmegaConf.get_type(
            cfg.optim) is SGDConfig:  # the default, set at line 8 of train_config
        optimizer = torch.optim.SGD(parameters,
                                    lr=cfg.optim.learning_rate,
                                    momentum=cfg.optim.momentum,
                                    nesterov=True,
                                    weight_decay=cfg.optim.weight_decay)
    elif OmegaConf.get_type(cfg.optim) is AdamConfig:
        optimizer = torch.optim.AdamW(parameters,
                                      lr=cfg.optim.learning_rate,
                                      betas=cfg.optim.betas,
                                      eps=cfg.optim.eps,
                                      weight_decay=cfg.optim.weight_decay)
    else:
        raise ValueError("Optimizer has not been specified correctly.")

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=not cfg.training.no_cuda,
                                      opt_level=cfg.apex.opt_level,
                                      loss_scale=cfg.apex.loss_scale)
    if state.optim_state is not None:
        optimizer.load_state_dict(state.optim_state)
    if state.amp_state is not None:
        amp.load_state_dict(state.amp_state)

    # Track states for optimizer/amp
    state.track_optim_state(optimizer)
    if not cfg.training.no_cuda:
        state.track_amp_state(amp)

    if is_distributed:
        model = DistributedDataParallel(model, device_ids=[device_id])
    #print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    criterion = CTCLoss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(
            state.epoch,
            cfg.training.epochs):  # one epoch iterates over the whole training CSV
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        state.set_epoch(epoch=epoch)
        train_sampler.set_epoch(epoch=epoch)
        train_sampler.reset_training_step(training_step=state.training_step)
        for i, (data) in enumerate(train_loader, start=state.training_step
                                   ):  # iterate over each minibatch (32 samples)
            state.set_training_step(training_step=i)
            inputs, targets, input_percentages, target_sizes = data  # inputs[x][0] holds the x-th spectrogram in the batch; input_percentages: each utterance's length as a fraction of the max length; targets: array of character codes
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int(
            )  # e.g. tensor([699, 682, 656, 560, 553, 517, 514, 502, 464, 458, 423, 412, 406, 349, ...]): each fraction times the max length recovers the true length of each utterance
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = inputs.to(device)
            # pass the batch of 32 inputs and their true lengths (spectrogram time steps) through the model
            out, output_sizes = model(
                inputs, input_sizes
            )  # out: 3-D tensor; output_sizes: 1-D tensor with the predicted output length for each utterance
            out = out.transpose(
                0, 1
            )  # TxNxH after the transpose: dims 0 and 1 are swapped, e.g. out becomes (190 x 3 x 93)

            float_out = out.float()  # ensure float32 for loss
            loss = criterion(float_out, targets, output_sizes,
                             target_sizes).to(device)
            loss = loss / inputs.size(
                0
            )  # average the loss by minibatch: total loss divided by the number of spectrograms in the batch
            loss_value = loss.item()

            # Check to ensure valid loss was calculated
            valid_loss, error = check_loss(loss, loss_value)
            if valid_loss:
                optimizer.zero_grad()

                # compute gradients; SGD then updates the weights
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               cfg.optim.max_norm)
                optimizer.step()
            else:
                print(error)
                print('Skipping grad update')
                loss_value = 0

            state.avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))
            if main_proc and cfg.checkpointing.checkpoint_per_iteration:
                checkpoint_handler.save_iter_checkpoint_model(epoch=epoch,
                                                              i=i,
                                                              state=state)
            del loss, out, float_out

        state.avg_loss /= len(train_dataset)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1,
                                                 epoch_time=epoch_time,
                                                 loss=state.avg_loss))
        mylogg2er.info('Training Summary Epoch: [{0}]\t'
                       'Time taken (s): {epoch_time:.0f}\t'
                       'Average Loss {loss:.3f}\n'.format(
                           epoch + 1,
                           epoch_time=epoch_time,
                           loss=state.avg_loss))
        with open('/root/epoch.log', 'a') as file_object:
            file_object.write('Training Summary Epoch: [{0}]\t'
                              'Time taken (s): {epoch_time:.0f}\t'
                              'Average Loss {loss:.3f}\n'.format(
                                  epoch + 1,
                                  epoch_time=epoch_time,
                                  loss=state.avg_loss))
        with torch.no_grad():
            wer, cer, output_data, wer2, cer2 = run_evaluation(
                test_loader=test_loader,
                device=device,
                model=model,
                decoder=evaluation_decoder,
                target_decoder=evaluation_decoder)

        state.add_results(epoch=epoch,
                          loss_result=state.avg_loss,
                          wer_result=wer,
                          cer_result=cer)

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))
        # mylogg2er.info('Validation Summary Epoch: [{0}]\t'
        #       'Average WER {wer:.3f}\t'
        #       'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))
        with open('/root/epoch.log', 'a') as file_object:
            file_object.write('Validation Summary Epoch: [{0}]\t'
                              'Average WER {wer:.3f}\t'
                              'Average CER {cer:.3f}\t'.format(epoch + 1,
                                                               wer=wer,
                                                               cer=cer))
            file_object.write('Validation Summary Epoch: [{0}]\t'
                              'Average WER2 {wer:.3f}\t'
                              'Average CER2 {cer:.3f}\n'.format(epoch + 1,
                                                                wer=wer2,
                                                                cer=cer2))
        if main_proc and cfg.visualization.visdom:
            visdom_logger.update(epoch, state.result_state)
        if main_proc and cfg.visualization.tensorboard:
            tensorboard_logger.update(epoch, state.result_state,
                                      model.named_parameters())

        if main_proc and cfg.checkpointing.checkpoint:  # Save epoch checkpoint
            checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state)
        # anneal lr
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] / cfg.optim.learning_anneal
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))
        with open('/root/epoch.log', 'a') as file_object:
            file_object.write(
                'Learning rate annealed to: {lr:.6f}\n'.format(lr=g['lr']))
        try:
            #  print('Training Summary Epoch: [{0}]\t'
            #   'Time taken (s): {epoch_time:.0f}\t'
            #   'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=state.avg_loss))/////////
            note = ("Changed train_config parameters: type: rnn.gru, epochs: int = 50, "
                    "batch_size: int = 30, hidden_size: int = 1600, hidden_layers: int = 7, "
                    "train manifest files: vinfptunk_train.csv, vinfptunk_dev.csv")
            sendReport(epoch + 1, '{:.3f}'.format(epoch_time),
                       '{:.3f}'.format(state.avg_loss), '{:.3f}'.format(wer),
                       '{:.3f}'.format(cer), "{:.6f}".format(g['lr']), note)
        except Exception as esss:
            print('Error :', esss)
        if main_proc and (state.best_wer is None or state.best_wer > wer):
            checkpoint_handler.save_best_model(epoch=epoch, state=state)
            state.set_best_wer(wer)
            state.reset_avg_loss()
        state.reset_training_step()  # Reset training step for next epoch
Example #12
    def __init__(self,
                 labels: List,
                 model_cfg: Union[UniDirectionalConfig, BiDirectionalConfig],
                 precision: int,
                 optim_cfg: Union[AdamConfig, SGDConfig],
                 spect_cfg: SpectConfig
                 ):
        super().__init__()
        self.save_hyperparameters()
        self.model_cfg = model_cfg
        self.precision = precision
        self.optim_cfg = optim_cfg
        self.spect_cfg = spect_cfg
        self.bidirectional = OmegaConf.get_type(model_cfg) is BiDirectionalConfig

        print('OMEGA CONF \n', model_cfg)
        self.labels = labels
        num_classes = len(self.labels)

        self.conv = MaskConv(nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)
        ))
        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
        rnn_input_size = int(math.floor((self.spect_cfg.sample_rate * self.spect_cfg.window_size) / 2) + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
        rnn_input_size *= 32

        self.rnns = nn.Sequential(
            BatchRNN(
                input_size=rnn_input_size,
                hidden_size=self.model_cfg.hidden_size,
                rnn_type=self.model_cfg.rnn_type.value,
                bidirectional=self.bidirectional,
                batch_norm=False
            ),
            *(
                BatchRNN(
                    input_size=self.model_cfg.hidden_size,
                    hidden_size=self.model_cfg.hidden_size,
                    rnn_type=self.model_cfg.rnn_type.value,
                    bidirectional=self.bidirectional
                ) for x in range(self.model_cfg.hidden_layers - 1)
            )
        )

        self.lookahead = nn.Sequential(
            # consider adding batch norm?
            Lookahead(self.model_cfg.hidden_size, context=self.model_cfg.lookahead_context),
            nn.Hardtanh(0, 20, inplace=True)
        ) if not self.bidirectional else None

        fully_connected = nn.Sequential(
            nn.BatchNorm1d(self.model_cfg.hidden_size),
            nn.Linear(self.model_cfg.hidden_size, num_classes, bias=False)
        )
        self.fc = nn.Sequential(
            SequenceWise(fully_connected),
        )
        self.inference_softmax = InferenceBatchSoftmax()
        # self.criterion = CTCLoss(blank=self.labels.index('_'), reduction='sum', zero_infinity=True)
        self.criterion = LWLRAP(precision=self.precision)
        self.evaluation_decoder = GreedyDecoder(self.labels)  # Decoder used for validation
        # self.wer = WordErrorRate(
        #     decoder=self.evaluation_decoder,
        #     target_decoder=self.evaluation_decoder
        # )
        # self.cer = CharErrorRate(
        #     decoder=self.evaluation_decoder,
        #     target_decoder=self.evaluation_decoder
        # )
        self.lwlrap = LWLRAP(precision=self.precision)
        self.loss = RMSELoss()
        self.softmax = nn.Softmax(dim=-1)
        # self.loss = nn.BCEWithLogitsLoss()
        self.sigmoid = nn.Sigmoid()
Example #13
class DeepSpeech(pl.LightningModule):
    def __init__(self, labels: List, model_cfg: Union[UniDirectionalConfig,
                                                      BiDirectionalConfig],
                 precision: int, optim_cfg: Union[AdamConfig, SGDConfig],
                 spect_cfg: SpectConfig):
        super().__init__()
        self.save_hyperparameters()
        self.model_cfg = model_cfg
        self.precision = precision
        self.optim_cfg = optim_cfg
        self.spect_cfg = spect_cfg
        self.bidirectional = OmegaConf.get_type(
            model_cfg) is BiDirectionalConfig

        self.labels = labels
        num_classes = len(self.labels)

        self.conv = MaskConv(
            nn.Sequential(
                nn.Conv2d(1,
                          32,
                          kernel_size=(41, 11),
                          stride=(2, 2),
                          padding=(20, 5)), nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True),
                nn.Conv2d(32,
                          32,
                          kernel_size=(21, 11),
                          stride=(2, 1),
                          padding=(10, 5)), nn.BatchNorm2d(32),
                nn.Hardtanh(0, 20, inplace=True)))
        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
        rnn_input_size = int(
            math.floor((self.spect_cfg.sample_rate *
                        self.spect_cfg.window_size) / 2) + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
        rnn_input_size *= 32

        self.rnns = nn.Sequential(
            BatchRNN(input_size=rnn_input_size,
                     hidden_size=self.model_cfg.hidden_size,
                     rnn_type=self.model_cfg.rnn_type.value,
                     bidirectional=self.bidirectional,
                     batch_norm=False),
            *(BatchRNN(input_size=self.model_cfg.hidden_size,
                       hidden_size=self.model_cfg.hidden_size,
                       rnn_type=self.model_cfg.rnn_type.value,
                       bidirectional=self.bidirectional)
              for x in range(self.model_cfg.hidden_layers - 1)))

        self.lookahead = nn.Sequential(
            # consider adding batch norm?
            Lookahead(self.model_cfg.hidden_size,
                      context=self.model_cfg.lookahead_context),
            nn.Hardtanh(0, 20,
                        inplace=True)) if not self.bidirectional else None

        fully_connected = nn.Sequential(
            nn.BatchNorm1d(self.model_cfg.hidden_size),
            nn.Linear(self.model_cfg.hidden_size, num_classes, bias=False))
        self.fc = nn.Sequential(SequenceWise(fully_connected), )
        self.inference_softmax = InferenceBatchSoftmax()
        self.criterion = CTCLoss(blank=self.labels.index('_'),
                                 reduction='sum',
                                 zero_infinity=True)
        self.evaluation_decoder = GreedyDecoder(
            self.labels)  # Decoder used for validation
        self.wer = WordErrorRate(decoder=self.evaluation_decoder,
                                 target_decoder=self.evaluation_decoder)
        self.cer = CharErrorRate(decoder=self.evaluation_decoder,
                                 target_decoder=self.evaluation_decoder)

    def forward(self, x, lengths):
        lengths = lengths.cpu().int()
        output_lengths = self.get_seq_lens(lengths)
        x, _ = self.conv(x, output_lengths)

        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2],
                   sizes[3])  # Collapse feature dimension
        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH

        for rnn in self.rnns:
            x = rnn(x, output_lengths)

        if not self.bidirectional:  # no need for lookahead layer in bidirectional
            x = self.lookahead(x)

        x = self.fc(x)
        x = x.transpose(0, 1)
        # identity in training mode, softmax in eval mode
        x = self.inference_softmax(x)
        return x, output_lengths

    def training_step(self, batch, batch_idx):
        inputs, targets, input_percentages, target_sizes = batch
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        out, output_sizes = self(inputs, input_sizes)
        out = out.transpose(0, 1)  # TxNxH
        out = out.log_softmax(-1)

        loss = self.criterion(out, targets, output_sizes, target_sizes)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, targets, input_percentages, target_sizes = batch
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        inputs = inputs.to(self.device)
        with autocast(enabled=self.precision == 16):
            out, output_sizes = self(inputs, input_sizes)
        decoded_output, _ = self.evaluation_decoder.decode(out, output_sizes)
        self.wer(preds=out,
                 preds_sizes=output_sizes,
                 targets=targets,
                 target_sizes=target_sizes)
        self.cer(preds=out,
                 preds_sizes=output_sizes,
                 targets=targets,
                 target_sizes=target_sizes)
        self.log('wer', self.wer.compute(), prog_bar=True, on_epoch=True)
        self.log('cer', self.cer.compute(), prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        if OmegaConf.get_type(self.optim_cfg) is SGDConfig:
            optimizer = torch.optim.SGD(
                params=self.parameters(),
                lr=self.optim_cfg.learning_rate,
                momentum=self.optim_cfg.momentum,
                nesterov=True,
                weight_decay=self.optim_cfg.weight_decay)
        elif OmegaConf.get_type(self.optim_cfg) is AdamConfig:
            optimizer = torch.optim.AdamW(
                params=self.parameters(),
                lr=self.optim_cfg.learning_rate,
                betas=self.optim_cfg.betas,
                eps=self.optim_cfg.eps,
                weight_decay=self.optim_cfg.weight_decay)
        else:
            raise ValueError("Optimizer has not been specified correctly.")

        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer=optimizer, gamma=self.optim_cfg.learning_anneal)
        return [optimizer], [scheduler]

    def get_seq_lens(self, input_length):
        """
        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
        containing the size sequences that will be output by the network.
        :param input_length: 1D Tensor
        :return: 1D Tensor scaled by model
        """
        seq_len = input_length
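        # Worked example, assuming the two conv layers defined above: along the time
        # axis both use kernel 11 and padding 5, with strides 2 and 1, so
        #   conv1: T -> (T + 2*5 - (11 - 1) - 1) // 2 + 1 = (T - 1) // 2 + 1
        #   conv2: stride 1 leaves the length unchanged
        # e.g. an input of 500 time frames yields output lengths of 250.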
        for m in self.conv.modules():
            if type(m) == nn.modules.conv.Conv2d:
                seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] *
                            (m.kernel_size[1] - 1) - 1) // m.stride[1] + 1)
        return seq_len.int()
Example #14
def run_quantsim_evaluation(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    import deepspeech_pytorch.model

    def wrapped_forward_function(self, x, lengths=None):
        if lengths is None:
            lengths = torch.IntTensor([_x.shape[0] for _x in x])
        return self.infer(x, lengths)

    deepspeech_pytorch.model.DeepSpeech.infer = deepspeech_pytorch.model.DeepSpeech.forward
    deepspeech_pytorch.model.DeepSpeech.forward = wrapped_forward_function

    model = load_model(device=device,
                       model_path=args.model_path,
                       use_half=False)

    decoder = load_decoder(labels=model.labels, cfg=LMConfig)

    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))

    def eval_func(model, iterations=None, device=device):
        test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                          manifest_filepath=args.test_manifest,
                                          labels=model.labels,
                                          normalize=True)

        if iterations is not None:
            test_dataset.size = iterations

        test_loader = AudioDataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers)

        wer, cer, output_data = run_evaluation(test_loader=test_loader,
                                               device=device,
                                               model=model,
                                               decoder=decoder,
                                               target_decoder=target_decoder,
                                               save_output=False,
                                               verbose=True,
                                               use_half=False)
        return wer, cer, output_data

    quant_scheme = QuantScheme.post_training_tf_enhanced

    sim = QuantizationSimModel(model.cpu(),
                               input_shapes=tuple([1, 1, 161, 500]),
                               quant_scheme=quant_scheme,
                               default_param_bw=args.default_param_bw,
                               default_output_bw=args.default_output_bw,
                               config_file=args.quantsim_config_file)

    manually_configure_quant_ops(sim)

    sim.model.to(device)
    sim.compute_encodings(eval_func,
                          forward_pass_callback_args=args.encodings_iterations)

    wer, cer, output_data = eval_func(sim.model, None)
    print('Average WER {:.4f}'.format(wer))
Example #15
def train(cfg):

    # Initialize a txt file for saving results
    with open(
            "/home/jhjeong/jiho_deep/deepspeech.pytorch/jiho_result/result.txt",
            "w") as ff:
        ff.write("Training started!\n")

    # Set seeds for determinism
    torch.manual_seed(cfg.training.seed)
    torch.cuda.manual_seed_all(cfg.training.seed)
    np.random.seed(cfg.training.seed)
    random.seed(cfg.training.seed)

    main_proc = True
    device = torch.device("cpu" if cfg.training.no_cuda else "cuda")

    is_distributed = os.environ.get(
        "LOCAL_RANK")  # If local rank exists, distributed env

    if is_distributed:
        # When using NCCL, surviving nodes will deadlock on NCCL ops after a failure
        # because NCCL uses a spin-lock on the device. Set this env var to enable a
        # watchdog thread that will destroy stale NCCL communicators.
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

        device_id = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device_id)
        print(f"Setting CUDA Device to {device_id}")

        dist.init_process_group(backend=cfg.training.dist_backend)
        main_proc = device_id == 0  # Main process handles saving of models and reporting

    checkpoint_handler = CheckpointHandler(
        save_folder=to_absolute_path(cfg.checkpointing.save_folder),
        best_val_model_name=cfg.checkpointing.best_val_model_name,
        checkpoint_per_iteration=cfg.checkpointing.checkpoint_per_iteration,
        save_n_recent_models=cfg.checkpointing.save_n_recent_models)

    # Choose whether to use visdom or tensorboard for visualization
    if main_proc and cfg.visualization.visdom:
        visdom_logger = VisdomLogger(id=cfg.visualization.id,
                                     num_epochs=cfg.training.epochs)
    if main_proc and cfg.visualization.tensorboard:
        tensorboard_logger = TensorBoardLogger(
            id=cfg.visualization.id,
            log_dir=to_absolute_path(cfg.visualization.log_dir),
            log_params=cfg.visualization.log_params)

    if cfg.checkpointing.load_auto_checkpoint:
        latest_checkpoint = checkpoint_handler.find_latest_checkpoint()
        if latest_checkpoint:
            cfg.checkpointing.continue_from = latest_checkpoint

    # Resume from a previous checkpoint if one was specified
    if cfg.checkpointing.continue_from:  # Starting from previous model
        state = TrainingState.load_state(
            state_path=to_absolute_path(cfg.checkpointing.continue_from))
        model = state.model
        if cfg.training.finetune:
            state.init_finetune_states(cfg.training.epochs)

        if main_proc and cfg.visualization.visdom:  # Add previous scores to visdom graph
            visdom_logger.load_previous_values(state.epoch, state.results)
        if main_proc and cfg.visualization.tensorboard:  # Previous scores to tensorboard logs
            tensorboard_logger.load_previous_values(state.epoch, state.results)
    else:
        # Initialise new model training
        with open(to_absolute_path(cfg.data.labels_path)) as label_file:
            labels = json.load(label_file)  # label(a,b,c ...)

        audio_conf = dict(sample_rate=cfg.data.sample_rate,
                          window_size=cfg.data.window_size,
                          window_stride=cfg.data.window_stride,
                          window=cfg.data.window)
        if cfg.augmentation.noise_dir:
            # dicts do not support +=; merge the noise settings in instead
            audio_conf.update(noise_dir=to_absolute_path(
                cfg.augmentation.noise_dir),
                              noise_prob=cfg.augmentation.noise_prob,
                              noise_levels=(cfg.augmentation.noise_min,
                                            cfg.augmentation.noise_max))

        rnn_type = cfg.model.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

        # Create the DeepSpeech model
        model = DeepSpeech(rnn_hidden_size=cfg.model.hidden_size,
                           nb_layers=cfg.model.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=cfg.model.bidirectional)

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=cfg.training.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(
        model.labels)  # Decoder used for validation

    # Organize the data paths and build the datasets
    train_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=to_absolute_path(cfg.data.train_manifest),
        labels=model.labels,
        normalize=True,
        speed_volume_perturb=cfg.augmentation.speed_volume_perturb,
        spec_augment=cfg.augmentation.spec_augment)

    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=to_absolute_path(
                                          cfg.data.val_manifest),
                                      labels=model.labels,
                                      normalize=True,
                                      speed_volume_perturb=False,
                                      spec_augment=False)

    if not is_distributed:
        train_sampler = DSRandomSampler(dataset=train_dataset,
                                        batch_size=cfg.data.batch_size,
                                        start_index=state.training_step)
    else:
        train_sampler = DSElasticDistributedSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step)

    # Build the data loaders
    train_loader = AudioDataLoader(dataset=train_dataset,
                                   num_workers=cfg.data.num_workers,
                                   batch_sampler=train_sampler)
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size)

    model = model.to(device)
    parameters = model.parameters()

    if cfg.optimizer.adam:
        optimizer = torch.optim.AdamW(parameters,
                                      lr=cfg.optimizer.learning_rate,
                                      betas=cfg.optimizer.betas,
                                      eps=cfg.optimizer.eps,
                                      weight_decay=cfg.optimizer.weight_decay)
    else:
        optimizer = torch.optim.SGD(parameters,
                                    lr=cfg.optimizer.learning_rate,
                                    momentum=cfg.optimizer.momentum,
                                    nesterov=True,
                                    weight_decay=cfg.optimizer.weight_decay)

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=cfg.apex.opt_level,
                                      loss_scale=cfg.apex.loss_scale)

    if state.optim_state is not None:
        optimizer.load_state_dict(state.optim_state)
        amp.load_state_dict(state.amp_state)

    # Track states for optimizer/amp
    state.track_optim_state(optimizer)
    state.track_amp_state(amp)

    if is_distributed:
        model = DistributedDataParallel(model, device_ids=[device_id])

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    criterion = CTCLoss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(state.epoch, cfg.training.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        state.set_epoch(epoch=epoch)
        train_sampler.set_epoch(epoch=epoch)
        train_sampler.reset_training_step(training_step=state.training_step)

        # Iterate over the training data
        for i, (data) in enumerate(train_loader, start=state.training_step):
            state.set_training_step(training_step=i)
            inputs, targets, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()

            # measure data loading time
            data_time.update(time.time() - end)
            inputs = inputs.to(device)

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            float_out = out.float()  # ensure float32 for loss
            loss = criterion(float_out, targets, output_sizes,
                             target_sizes).to(device)
            loss = loss / inputs.size(0)  # average the loss by minibatch
            loss_value = loss.item()

            # Check to ensure valid loss was calculated
            valid_loss, error = check_loss(loss, loss_value)
            if valid_loss:
                optimizer.zero_grad()

                # compute gradient
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               cfg.optimizer.max_norm)
                optimizer.step()
            else:
                print(error)
                print('Skipping grad update')
                loss_value = 0

            state.avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))

            if main_proc and cfg.checkpointing.checkpoint_per_iteration:
                checkpoint_handler.save_iter_checkpoint_model(epoch=epoch,
                                                              i=i,
                                                              state=state)
            del loss, out, float_out

        state.avg_loss /= len(train_dataset)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1,
                                                 epoch_time=epoch_time,
                                                 loss=state.avg_loss))

        with open(
                "/home/jhjeong/jiho_deep/deepspeech.pytorch/jiho_result/result.txt",
                "a") as ff:
            ff.write("\ntrain -> epoch : {} loss : {}\n".format(
                epoch + 1, state.avg_loss))

        with torch.no_grad():
            wer, cer, output_data = evaluate(test_loader=test_loader,
                                             device=device,
                                             model=model,
                                             decoder=evaluation_decoder,
                                             target_decoder=evaluation_decoder)

        state.add_results(epoch=epoch,
                          loss_result=state.avg_loss,
                          wer_result=wer,
                          cer_result=cer)

        print('Validation Summary Epoch: [{0}]\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, cer=cer))

        with open(
                "/home/jhjeong/jiho_deep/deepspeech.pytorch/jiho_result/result.txt",
                "a") as ff:
            ff.write("\nval -> epoch : {} cer : {}\n".format(epoch + 1, cer))

        # Update the visualization logs (visdom / tensorboard)
        if main_proc and cfg.visualization.visdom:
            visdom_logger.update(epoch, state.result_state)
        if main_proc and cfg.visualization.tensorboard:
            tensorboard_logger.update(epoch, state.result_state,
                                      model.named_parameters())

        if main_proc and cfg.checkpointing.checkpoint:  # Save epoch checkpoint
            checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state)
        # anneal lr
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] / cfg.optimizer.learning_anneal
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))

        if main_proc and (state.best_wer is None or state.best_wer > wer):
            checkpoint_handler.save_best_model(epoch=epoch, state=state)
            state.set_best_wer(wer)
            state.reset_avg_loss()
        state.reset_training_step()  # Reset training step for next epoch
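# A minimal sketch of an entry point for train(cfg). deepspeech.pytorch normally
# drives this through Hydra; the config path/name below are assumptions for
# illustration, not the project's exact setup.
import hydra
from omegaconf import DictConfig


@hydra.main(config_path=".", config_name="config")
def main(cfg: DictConfig):
    train(cfg)


if __name__ == "__main__":
    main()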
예제 #16
0
if __name__ == '__main__':
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    decoder = load_decoder(decoder_type=args.decoder,
                           labels=model.labels,
                           lm_path=args.lm_path,
                           alpha=args.alpha,
                           beta=args.beta,
                           cutoff_top_n=args.cutoff_top_n,
                           cutoff_prob=args.cutoff_prob,
                           beam_width=args.beam_width,
                           lm_workers=args.lm_workers)
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    wer, cer, output_data = evaluate(test_loader=test_loader,
                                     device=device,
                                     model=model,
                                     decoder=decoder,
                                     target_decoder=target_decoder,
                                     save_output=args.save_output,
                                     verbose=args.verbose,
                                     half=args.half)
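    # A small follow-up sketch, not in the original snippet: report the metrics
    # computed above (evaluate() is assumed to return WER/CER as percentages).
    print('Test Summary\t'
          'Average WER {wer:.3f}\t'
          'Average CER {cer:.3f}'.format(wer=wer, cer=cer))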
예제 #17
0
def run_evaluation(test_loader,
                   device,
                   model,
                   decoder,
                   target_decoder,
                   save_output=None,
                   verbose=False,
                   use_half=False):
    model.eval()
    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
    total_cer2, total_wer2, num_tokens2, num_chars2 = 0, 0, 0, 0

    output_data = []
    for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = data
        # length (in frames) of each sample's spectrogram; one entry per sample in the batch
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        inputs = inputs.to(device)
        if use_half:
            inputs = inputs.half()  # does not change the results much
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out, output_sizes = model(inputs, input_sizes)

        decoded_output, _ = decoder.decode(out, output_sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)

        if save_output is not None:
            # add output to data array, and continue
            output_data.append((out.cpu(), output_sizes, target_strings))

        # Also decode the same outputs with a greedy decoder for comparison
        decoder2 = GreedyDecoder(labels=model.labels,
                                 blank_index=model.labels.index('_'))
        old_out, out_offsets = decoder2.decode(out, output_sizes)
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference)
            cer_inst = decoder.cer(transcript, reference)
            total_wer += wer_inst
            total_cer += cer_inst
            num_tokens += len(reference.split())
            num_chars += len(reference.replace(' ', ''))
            if verbose:
                print("TRUTH :", reference.lower())
                print("Beam  :", transcript.lower())
                print("WER:",
                      float(wer_inst) / len(reference.split()), "CER:",
                      float(cer_inst) / len(reference.replace(' ', '')))

            transcript2 = old_out[x][0]
            wer_inst2 = decoder2.wer(transcript2, reference)
            cer_inst2 = decoder2.cer(transcript2, reference)
            total_wer2 += wer_inst2
            total_cer2 += cer_inst2
            num_tokens2 += len(reference.split())
            num_chars2 += len(reference.replace(' ', ''))
            if verbose:
                print("Greedy:", transcript2.lower())
                print("WER2:",
                      float(wer_inst2) / len(reference.split()), "CER2:",
                      float(cer_inst2) / len(reference.replace(' ', '')), "\n")
    wer = float(total_wer) / num_tokens
    cer = float(total_cer) / num_chars
    wer2 = float(total_wer2) / num_tokens2
    cer2 = float(total_cer2) / num_chars2
    # Return beam and greedy metrics as percentages
    return wer * 100, cer * 100, output_data, wer2 * 100, cer2 * 100
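

# A minimal usage sketch for run_evaluation, assuming a model, test_loader and
# both decoders have been set up as in the earlier examples; the variable names
# here are illustrative only.
beam_wer, beam_cer, outputs, greedy_wer, greedy_cer = run_evaluation(
    test_loader=test_loader,
    device=device,
    model=model,
    decoder=decoder,
    target_decoder=target_decoder,
    verbose=True)
print('Beam   WER {:.2f}  CER {:.2f}'.format(beam_wer, beam_cer))
print('Greedy WER {:.2f}  CER {:.2f}'.format(greedy_wer, greedy_cer))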