Example No. 1
def save(model, ema_model, optimizer, epoch, output_dir, optim_level):
    """
    Saves model checkpoint
    Args:
        model: model
        ema_model: model with exponential averages of weights
        optimizer: optimizer
        epoch: epoch of model training
        output_dir: path to save model checkpoint
        optim_level: AMP optimization level (amp state is saved when > 0)
    """
    out_fpath = os.path.join(output_dir, f"Jasper_epoch{epoch}_checkpoint.pt")
    print_once(f"Saving {out_fpath}...")

    if torch.distributed.is_initialized():
        torch.distributed.barrier()
        rank = torch.distributed.get_rank()
    else:
        rank = 0

    if rank == 0:
        checkpoint = {
            'epoch': epoch,
            'state_dict': getattr(model, 'module', model).state_dict(),
            'optimizer': optimizer.state_dict(),
            'amp': amp.state_dict() if optim_level > 0 else None,
        }
        if ema_model is not None:
            checkpoint['ema_state_dict'] = getattr(ema_model, 'module',
                                                   ema_model).state_dict()
        torch.save(checkpoint, out_fpath)

    print_once('Saved.')
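
For reference, a checkpoint written by this save() can be restored symmetrically. load_checkpoint below is a hypothetical helper (not part of the example above), assuming the model and optimizer were constructed the same way as at save time:

import torch

def load_checkpoint(model, optimizer, ckpt_path):
    # load on CPU first so the file does not occupy GPU memory during restore
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    # unwrap DistributedDataParallel the same way save() does
    getattr(model, 'module', model).load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch']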
Example No. 2
    def eval(model, name=''):
        """Evaluates model on evaluation dataset
        """
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                if optim_level == 1:
                    with amp.disable_casts():
                        t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                            t_audio_signal_e, t_a_sig_length_e)
                else:
                    t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                        t_audio_signal_e, t_a_sig_length_e)
                if jasper_encoder.use_conv_mask:
                    t_log_probs_e, t_encoded_len_e = model.forward(
                        (t_processed_signal_e, t_processed_sig_length_e))
                else:
                    # without the conv mask the model returns only log probs;
                    # the CTC loss below still needs an encoded length, so this
                    # path assumes the encoder preserves the time dimension
                    t_log_probs_e = model.forward(t_processed_signal_e)
                    t_encoded_len_e = t_processed_sig_length_e
                t_loss_e = ctc_loss(log_probs=t_log_probs_e,
                                    targets=t_transcript_e,
                                    input_length=t_encoded_len_e,
                                    target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(loss=[t_loss_e],
                                   predictions=[t_predictions_e],
                                   transcript=[t_transcript_e],
                                   transcript_length=[t_transcript_len_e])
                process_evaluation_batch(values_dict,
                                         _global_var_dict,
                                         labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)

            if name != '':
                name = '_' + name

            print_once(f"==========>>>>>>Evaluation{name} Loss: {eloss}\n")
            print_once(f"==========>>>>>>Evaluation{name} WER: {wer}\n")
Example No. 3
def save(model, optimizer, epoch, output_dir):
    """
    Saves model checkpoint
    Args:
        model: model 
        optimizer: optimizer
        epoch: epoch of model training
        output_dir: path to save model checkpoint
    """
    class_name = model.__class__.__name__
    unix_time = time.time()
    file_name = "{0}_{1}-epoch-{2}.pt".format(class_name, unix_time, epoch)
    print_once("Saving module {0} in {1}".format(class_name, os.path.join(output_dir, file_name)))
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        save_checkpoint = {
            'epoch': epoch,
            'state_dict': model_to_save.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        torch.save(save_checkpoint, os.path.join(output_dir, file_name))
    print_once('Saved.')
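
Since this variant embeds a Unix timestamp in the filename, resuming usually means picking the newest file in output_dir. latest_checkpoint below is a hypothetical helper, not part of the original script:

import glob
import os

def latest_checkpoint(output_dir):
    # files are named "<Class>_<unix_time>-epoch-<epoch>.pt", so the most
    # recently modified .pt file is the latest checkpoint
    candidates = glob.glob(os.path.join(output_dir, "*.pt"))
    return max(candidates, key=os.path.getmtime) if candidates else None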
Example No. 4
    def eval():
        """Evaluates model on evaluation dataset
        """
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
                t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e]
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)

            print_once("==========>>>>>>Evaluation Loss: {0}\n".format(eloss))
            print_once("==========>>>>>>Evaluation WER: {0}\n".format(wer))
Example No. 5
def train(
        data_layer, 
        data_layer_eval,
        model,
        ctc_loss, 
        greedy_decoder, 
        optimizer, 
        optim_level, 
        labels, 
        multi_gpu, 
        args,
        fn_lr_policy=None):
    """Trains model
    Args:
        data_layer: training data layer
        data_layer_eval: evaluation data layer
        model: model (encapsulates data processing, encoder, decoder)
        ctc_loss: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        args: script input argument list
        fn_lr_policy: learning rate adjustment function
    """
    def eval():
        """Evaluates model on evaluation dataset
        """
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
                t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e]
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            # modified to also report CER alongside WER
            wer, eloss, cer = process_evaluation_epoch2(_global_var_dict)
        
            print_once("==========>>>>>>Evaluation Loss: {0}".format(eloss))
            print_once("==========>>>>>>Evaluation WER: {0}".format(wer))
            #lnw add for cer
            print_once("==========>>>>>>Evaluation CER: {0}".format(cer))
            #lnw add
            print("Evaluation end time : "+str(datetime.now()))
            
            
    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    step = epoch * args.step_per_epoch

    while True:
        if multi_gpu:
            data_layer.sampler.set_epoch(epoch)
        print_once("Starting epoch {0}, step {1}".format(epoch, step))
        # wall-clock timestamps for per-epoch timing
        lEpochStart_time = datetime.now()
        print("Epoch Start time : " + str(lEpochStart_time))
        last_epoch_start = time.time()
        batch_counter = 0
        average_loss = 0
        for data in train_dataloader:
            tensors = []
            for d in data:
                if isinstance(d, torch.Tensor):
                    tensors.append(d.cuda())
                else:
                    tensors.append(d)

            if batch_counter == 0:

                if fn_lr_policy is not None:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                optimizer.zero_grad()
                last_iter_start = time.time()

            t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = tensors
            model.train()
            t_log_probs_t, t_encoded_len_t = model(x=(t_audio_signal_t, t_a_sig_length_t))

            t_loss_t = ctc_loss(log_probs=t_log_probs_t, targets=t_transcript_t, input_length=t_encoded_len_t, target_length=t_transcript_len_t)
            if args.gradient_accumulation_steps > 1:
                t_loss_t = t_loss_t / args.gradient_accumulation_steps

            if optim_level in AmpOptimizations:
                with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                t_loss_t.backward()
            batch_counter += 1
            average_loss += t_loss_t.item()

            if batch_counter % args.gradient_accumulation_steps == 0:
                optimizer.step()

                if step % args.train_frequency == 0:
                    t_predictions_t = greedy_decoder(log_probs=t_log_probs_t)

                    e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                    # modified to also report CER alongside WER
                    train_wer, train_cer = monitor_asr_train_progress2(e_tensors, labels=labels)
                    print_once("Loss@Step: {0}  ::::::: {1}".format(step, str(average_loss)))
                    print_once("Step time: {0} seconds".format(time.time() - last_iter_start))

                    print_once("==========>>>>>>Train WER: {0}".format(train_wer))
                    print_once("==========>>>>>>Train CER: {0}".format(train_cer))

                if step > 0 and step % args.eval_frequency == 0:
                    print_once("Doing Evaluation ....................... ......  ... .. . .")
                    eval()
                step += 1
                batch_counter = 0
                average_loss = 0
                if args.num_steps is not None and step >= args.num_steps:
                    break

        if args.num_steps is not None and step >= args.num_steps:
            break
        print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
        epoch += 1
        if epoch % args.save_frequency == 0 and epoch > 0:
            save(model, optimizer, epoch, output_dir=args.output_dir)
        if args.num_steps is None and epoch >= args.num_epochs:
            break

        # per-epoch wall-clock timing
        lEpochEnd_time = datetime.now()
        print("Epoch End time: " + str(lEpochEnd_time), "Duration:", str(lEpochEnd_time - lEpochStart_time), "StartTime-NowTime:", str(lEpochEnd_time - lstart_time))

    print_once("Done in {0}".format(time.time() - start_time))
    print_once("Final Evaluation ....................... ......  ... .. . .")
    eval()
    save(model, optimizer, epoch, output_dir=args.output_dir)
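
The loop above implements gradient accumulation through batch_counter: several micro-batch losses are scaled down and backpropagated before a single optimizer.step(). Stripped of the ASR specifics, the pattern looks like this (a minimal sketch with hypothetical names):

def accumulation_loop(model, optimizer, loss_fn, micro_batches, accum_steps):
    # gradients accumulate in .grad across backward() calls until step()
    optimizer.zero_grad()
    for i, (x, y) in enumerate(micro_batches, 1):
        loss = loss_fn(model(x), y) / accum_steps  # scale so gradients average
        loss.backward()
        if i % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()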
Example No. 6
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    assert(torch.cuda.is_available())
    torch.backends.cudnn.benchmark = args.cudnn

    # set up distributed training
    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    multi_gpu = torch.distributed.is_initialized()
    if multi_gpu:
        print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))

    # define amp optimization level
    if args.fp16:
        optim_level = Optimization.mxprO1
    else:
        optim_level = Optimization.mxprO0

    jasper_model_definition = toml.load(args.model_toml)
    dataset_vocab = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(dataset_vocab)

    train_manifest = args.train_manifest 
    val_manifest = args.val_manifest 
    featurizer_config = jasper_model_definition['input']
    featurizer_config_eval = jasper_model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = jasper_model_definition.get('perturb', None)
    if args.pad_to_max:
        assert(args.max_duration > 0)
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = "max"
        featurizer_config_eval['pad_to'] = "max"
    print_once('model_config')
    print_dict(jasper_model_definition)
         
    if args.gradient_accumulation_steps < 1:
        raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError('batch size {} is not divisible by gradient accumulation steps {}'.format(args.batch_size, args.gradient_accumulation_steps))


    data_layer = AudioToTextDataLayer(
                                    dataset_dir=args.dataset_dir,
                                    featurizer_config=featurizer_config,
                                    perturb_config=perturb_config,
                                    manifest_filepath=train_manifest,
                                    labels=dataset_vocab,
                                    batch_size=args.batch_size // args.gradient_accumulation_steps,
                                    multi_gpu=multi_gpu,
                                    pad_to_max=args.pad_to_max,
                                    sampler=sampler_type)

    data_layer_eval = AudioToTextDataLayer(
                                    dataset_dir=args.dataset_dir,
                                    featurizer_config=featurizer_config_eval,
                                    manifest_filepath=val_manifest,
                                    labels=dataset_vocab,
                                    batch_size=args.batch_size,
                                    multi_gpu=multi_gpu,
                                    pad_to_max=args.pad_to_max
                                    )
 
    model = Jasper(feature_config=featurizer_config, jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab))
 
    if args.ckpt is not None:
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    ctc_loss = CTCLossNM( num_classes=len(ctc_vocab))
    greedy_decoder = GreedyCTCDecoder()

    print_once("Number of parameters in encoder: {0}".format(model.jasper_encoder.num_weights()))
    print_once("Number of parameters in decode: {0}".format(model.jasper_decoder.num_weights()))

    N = len(data_layer)
    if sampler_type == 'default':
        args.step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size())))
    elif sampler_type == 'bucket':
        args.step_per_epoch = len(data_layer.sampler) // args.batch_size
    
    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    fn_lr_policy = lambda s: lr_policy(args.lr, s, args.num_epochs * args.step_per_epoch) 


    model.cuda()

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(),
                        lr=args.lr,
                        weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(),
                        lr=args.lr,
                        weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))


    if optim_level in AmpOptimizations:
        model, optimizer = amp.initialize(
            # min_loss_scale=1.0 removed: it raised an error with this apex version
            models=model,
            optimizers=optimizer,
            opt_level=AmpOptimizations[optim_level])
    
    if args.ckpt is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])

    model = model_multi_gpu(model, multi_gpu)

    train(
        data_layer=data_layer,
        data_layer_eval=data_layer_eval, 
        model=model, 
        ctc_loss=ctc_loss, 
        greedy_decoder=greedy_decoder,
        optimizer=optimizer, 
        labels=ctc_vocab, 
        optim_level=optim_level,
        multi_gpu=multi_gpu,
        fn_lr_policy=fn_lr_policy if args.lr_decay else None,
        args=args)
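
lr_policy itself is not shown in these examples; Jasper-style recipes commonly decay the rate polynomially over the total number of steps, so a hypothetical stand-in (name kept, body and exponent are assumptions) might be:

def lr_policy(initial_lr, step, total_steps, power=2.0):
    # polynomial decay from initial_lr toward 0 over total_steps
    remaining = max(1.0 - step / total_steps, 0.0)
    return initial_lr * remaining ** power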
Example No. 7
    def evalutaion(epoch=0):
        model.eval()
        if args.ipex:
            if args.bf16:
                print("running bfloat16 evaluation step\n")
            else:
                print("running fp32 evaluation step\n")

        for dataset, frequency, name in eval_datasets:
            if epoch % frequency != 0:
                continue

            print_once(f"Doing {name} ....................... ......  ... .. . .")

            with torch.no_grad():
                _global_var_dict = {
                    'EvalLoss': [],
                    'predictions': [],
                    'transcripts': [],
                }
                dataloader = dataset.data_iterator
                for data in dataloader:
                    t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = data_transforms(data)

                    if args.ipex:
                        if args.bf16:
                            with torch.cpu.amp.autocast():
                                t_log_probs_e, (x_len, y_len) = model(
                                    ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                                )
                        elif args.fp32:
                            t_log_probs_e, (x_len, y_len) = model(
                                ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                            )
                    else:
                        t_log_probs_e, (x_len, y_len) = model(
                            ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                        )
                    t_loss_e = loss_fn(
                        (t_log_probs_e, x_len), (t_transcript_e, y_len)
                    )
                    print(t_loss_e)
                    del t_log_probs_e

                    t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e)

                    values_dict = dict(
                        loss=[t_loss_e],
                        predictions=[t_predictions_e],
                        transcript=[t_transcript_e],
                        transcript_length=[t_transcript_len_e]
                    )
                    process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                # final aggregation (across all workers and minibatches) and logging of results
                wer, eloss = process_evaluation_epoch(_global_var_dict)
                logger.log_scalar('loss', eloss, epoch, name)
                logger.log_scalar('wer', wer, epoch, name)

                print_once(f"==========>>>>>>{name} Loss: {eloss}\n")
                print_once(f"==========>>>>>>{name} WER: {wer}\n")
Example No. 8
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    args.local_rank = os.environ.get('LOCAL_RANK', args.local_rank)
    # set up distributed training
    cpu_distributed_training = False
    if torch.distributed.is_available() and int(os.environ.get('PMI_SIZE', '0')) > 1:
        print('Distributed training with DDP')
        os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
        os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
        if 'MASTER_ADDR' not in os.environ:
            os.environ['MASTER_ADDR'] = args.master_addr
        if 'MASTER_PORT' not in os.environ:
            os.environ['MASTER_PORT'] = args.port

        # Initialize the process group with the chosen backend
        # (the ccl backend requires importing torch_ccl first)
        if args.backend == 'ccl':
            import torch_ccl
        dist.init_process_group(backend=args.backend)
        cpu_distributed_training = True
        if torch.distributed.is_initialized():
            print("Torch distributed is initialized.")
            args.rank = torch.distributed.get_rank()
            args.world_size = torch.distributed.get_world_size()
        else:
            print("Torch distributed is not initialized.")
            args.rank = 0
            args.world_size = 1

    # GPU data parallelism is disabled in this CPU (ipex) variant; when
    # distributed training is enabled above, it runs DDP across CPU ranks
    multi_gpu = False
    if multi_gpu:
        print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))

    optim_level = Optimization.mxprO0

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    train_manifest = args.train_manifest
    val_manifest = args.val_manifest
    tst_manifest = args.tst_manifest
    featurizer_config = model_definition['input']
    featurizer_config_eval = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = model_definition.get('perturb', None)
    if args.pad_to_max:
        assert(args.max_duration > 0)
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = "max"
        featurizer_config_eval['pad_to'] = "max"
    print_once('model_config')
    print_dict(model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError('batch size {} is not divisible by gradient accumulation steps {}'.format(args.batch_size, args.gradient_accumulation_steps))


    preprocessor = preprocessing.AudioPreprocessing(**featurizer_config)
    if args.cuda:
        preprocessor.cuda()
    else:
        preprocessor.cpu()

    augmentations = preprocessing.SpectrogramAugmentation(**featurizer_config)
    if args.cuda:
        augmentations.cuda()
    else:
        augmentations.cpu()

    train_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [augmentations(xs[0]),   *xs[1:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    data_layer = AudioToTextDataLayer(
                                    dataset_dir=args.dataset_dir,
                                    featurizer_config=featurizer_config,
                                    perturb_config=perturb_config,
                                    manifest_filepath=train_manifest,
                                    labels=dataset_vocab,
                                    batch_size=args.batch_size // args.gradient_accumulation_steps,
                                    multi_gpu=multi_gpu,
                                    pad_to_max=args.pad_to_max,
                                    sampler=sampler_type,
                                    cpu_distributed_training=cpu_distributed_training)

    eval_datasets = [(
        AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config_eval,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.eval_batch_size,
            multi_gpu=multi_gpu,
            pad_to_max=args.pad_to_max
        ),
        args.eval_frequency,
        'Eval clean',
    )]

    if tst_manifest:
        eval_datasets.append((
            AudioToTextDataLayer(
                dataset_dir=args.dataset_dir,
                featurizer_config=featurizer_config_eval,
                manifest_filepath=tst_manifest,
                labels=dataset_vocab,
                batch_size=args.eval_batch_size,
                multi_gpu=multi_gpu,
                pad_to_max=args.pad_to_max
            ),
            args.test_frequency,
            'Test other',
        ))

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )

    if args.ckpt is not None:
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    loss_fn = RNNTLoss(blank=len(ctc_vocab) - 1)

    N = len(data_layer)
    if sampler_type == 'default':
        args.step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size())))
    elif sampler_type == 'bucket':
        args.step_per_epoch = len(data_layer.sampler) // args.batch_size

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    constant_lr_policy = lambda _: args.lr
    fn_lr_policy = constant_lr_policy
    if args.lr_decay:
        pre_decay_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_decay(args.num_epochs * args.step_per_epoch, s, pre_decay_policy(s))
    if args.lr_warmup:
        pre_warmup_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_warmup(args.lr_warmup, s, pre_warmup_policy(s))

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(),
                        lr=args.lr,
                        weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(),
                        lr=args.lr,
                        weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))

    if args.cuda and optim_level in AmpOptimizations:
        assert False, "not supported in ipex"

    if args.ckpt is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])

    if args.ipex:
        if args.bf16:
            model, optimizer = ipex.optimize(model, dtype=torch.bfloat16, optimizer=optimizer)
            ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)
        else:
            model, optimizer = ipex.optimize(model, dtype=torch.float32, optimizer=optimizer)
            ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)

    if args.world_size > 1:
        device_ids = None
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=device_ids)

    print_once(model)
    print_once("# parameters: {}".format(sum(p.numel() for p in model.parameters())))
    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model)

    if args.tb_path and args.local_rank == 0:
        logger = TensorBoardLogger(args.tb_path, model.module if multi_gpu else model, args.histogram)
    else:
        logger = DummyLogger()

    train(
        data_layer=data_layer,
        model=model,
        loss_fn=loss_fn,
        greedy_decoder=greedy_decoder,
        optimizer=optimizer,
        data_transforms=train_transforms,
        labels=ctc_vocab,
        optim_level=optim_level,
        multi_gpu=multi_gpu,
        fn_lr_policy=fn_lr_policy,
        evalutaion=evaluator(model, eval_transforms, loss_fn, greedy_decoder, ctc_vocab, eval_datasets, logger),
        logger=logger,
        args=args)
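
The learning-rate schedule here is built by nesting closures around a constant base policy, as in the lr_decay/lr_warmup lines above. Those two functions' bodies are not shown in the examples; the sketch below assumes simple linear forms purely to make the composition runnable:

def lr_decay(total_steps, step, lr):
    # linearly decay whatever the inner policy returned
    return lr * max(1.0 - step / total_steps, 0.0)

def lr_warmup(warmup_steps, step, lr):
    # linearly ramp the inner policy's value over the first warmup_steps
    return lr * min((step + 1) / warmup_steps, 1.0)

base_lr, total_steps, warmup_steps = 1e-3, 1000, 100
fn_lr_policy = lambda _: base_lr                    # constant base policy
pre_decay_policy = fn_lr_policy
fn_lr_policy = lambda s: lr_decay(total_steps, s, pre_decay_policy(s))
pre_warmup_policy = fn_lr_policy
fn_lr_policy = lambda s: lr_warmup(warmup_steps, s, pre_warmup_policy(s))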
Example No. 9
def train(
        data_layer,
        model,
        loss_fn,
        greedy_decoder,
        optimizer,
        optim_level,
        labels,
        multi_gpu,
        data_transforms,
        args,
        evalutaion,
        logger,
        fn_lr_policy):
    """Trains model
    Args:
        data_layer: training data layer
        model: model (encapsulates data processing, encoder, decoder)
        loss_fn: loss function
        greedy_decoder: greedy decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        data_transforms: preprocessing transforms applied to each batch
        args: script input argument list
        evalutaion: evaluation callback
        logger: scalar logger
        fn_lr_policy: function returning lr in given step
    """
    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    step = epoch * args.step_per_epoch
    start_step = step

    if args.ipex:
        print("is ipex")
        if args.bf16:
            print("is bf16")
            print("running bfloat16 training step\n")
        elif args.fp32:
            print("running fp32 training step\n")
        total_time = 0
        while True:
            if multi_gpu:
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0
            average_loss = 0
            for data in tqdm(train_dataloader):
                if batch_counter == 0:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

                if args.profiling and (step - start_step) >= args.warmup:
                    with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                        if (step - start_step) >= args.warmup:
                            t0 = time.perf_counter()
                        if args.bf16:
                            with torch.cpu.amp.autocast():
                                t_log_probs_t, (x_len, y_len) = model(
                                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                                )
                        elif args.fp32:
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                        if args.bf16:
                            t_log_probs_t = t_log_probs_t.to(torch.float32)
                        t_loss_t = loss_fn(
                            (t_log_probs_t, x_len), (t_transcript_t, y_len)
                        )
                        logger.log_scalar('loss', t_loss_t.item(), step)
                        del t_log_probs_t
                        if args.gradient_accumulation_steps > 1:
                            t_loss_t = t_loss_t / args.gradient_accumulation_steps

                        if args.cuda and optim_level in AmpOptimizations:
                            assert False, "not supported in ipex"
                        else:
                            t_loss_t.backward()
                        t1 = time.perf_counter()
                        if (step - start_step) >= args.warmup:
                            total_time += (t1 - t0)
                else:
                    if (step - start_step) >= args.warmup:
                        t0 = time.perf_counter()
                    if args.bf16:
                        with torch.cpu.amp.autocast():
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                    elif args.fp32:
                        t_log_probs_t, (x_len, y_len) = model(
                            ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                        )
                    if args.bf16:
                        t_log_probs_t = t_log_probs_t.to(torch.float32)
                    t_loss_t = loss_fn(
                        (t_log_probs_t, x_len), (t_transcript_t, y_len)
                    )
                    logger.log_scalar('loss', t_loss_t.item(), step)
                    del t_log_probs_t
                    if args.gradient_accumulation_steps > 1:
                        t_loss_t = t_loss_t / args.gradient_accumulation_steps

                    if args.cuda and optim_level in AmpOptimizations:
                        assert False, "not supported in ipex"
                    else:
                        t_loss_t.backward()
                    t1 = time.perf_counter()
                    if (step - start_step) >= args.warmup:
                        total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()

                if batch_counter % args.gradient_accumulation_steps == 0:
                    optimizer.step()

                    if (step + 1) % args.train_frequency == 0:
                        # t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)

                        # e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        # train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0}  ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        # logger.log_scalar('wer', train_wer, step)

                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break

            # evalutaion(epoch)

            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            if args.num_steps is None and epoch >= args.num_epochs:
                break
        if args.profiling:
            # prof is the profiler context from the most recent profiled step
            print(prof.key_averages().table(sort_by="self_cpu_time_total"))

        print_once("Done in {0}".format(time.time() - start_time))
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ......  ... .. . .")
        # evalutaion()
        save(model, optimizer, epoch, output_dir=args.output_dir)
    else:
        total_time = 0
        while True:
            if multi_gpu:
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0
            average_loss = 0

            for data in train_dataloader:

                if batch_counter == 0:

                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

                if (step - start_step) >= args.warmup:
                    t0 = time.perf_counter()
                t_log_probs_t, (x_len, y_len) = model(
                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                )

                t_loss_t = loss_fn(
                    (t_log_probs_t, x_len), (t_transcript_t, y_len)
                )
                print(t_loss_t)
                logger.log_scalar('loss', t_loss_t.item(), step)
                del t_log_probs_t
                if args.gradient_accumulation_steps > 1:
                    t_loss_t = t_loss_t / args.gradient_accumulation_steps

                if args.cuda and optim_level in AmpOptimizations:
                    assert False, "not supported in ipex"
                else:
                    t_loss_t.backward()
                t1 = time.perf_counter()
                if (step - start_step) >= args.warmup:
                    total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()

                if batch_counter % args.gradient_accumulation_steps == 0:
                    optimizer.step()

                    if (step + 1) % args.train_frequency == 0:
                        t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)

                        e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0}  ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        logger.log_scalar('wer', train_wer, step)

                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break

            # evalutaion(epoch)

            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            if args.num_steps is None and epoch >= args.num_epochs:
                break
        print_once("Done in {0}".format(time.time() - start_time))
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ......  ... .. . .")
        # evalutaion()
        save(model, optimizer, epoch, output_dir=args.output_dir)
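
The throughput numbers at the end skip the first args.warmup steps so one-time startup costs do not distort the average. The timing pattern in isolation, with a hypothetical run_step callable (a minimal sketch):

import time

def measure_throughput(run_step, num_steps, warmup):
    # time only the steady-state steps after warmup
    total_time, measured = 0.0, 0
    for step in range(num_steps):
        t0 = time.perf_counter()
        run_step(step)
        t1 = time.perf_counter()
        if step >= warmup:
            total_time += t1 - t0
            measured += 1
    return measured / total_time if total_time else 0.0  # steps per second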
Example No. 10
def train(data_layer,
          data_layer_eval,
          model,
          ema_model,
          ctc_loss,
          greedy_decoder,
          optimizer,
          optim_level,
          labels,
          multi_gpu,
          args,
          fn_lr_policy=None):
    """Trains model
    Args:
        data_layer: training data layer
        data_layer_eval: evaluation data layer
        model: model (encapsulates data processing, encoder, decoder)
        ema_model: model tracking an exponential moving average of the weights
        ctc_loss: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        args: script input argument list
        fn_lr_policy: learning rate adjustment function
    """
    def eval(model, name=''):
        """Evaluates model on evaluation dataset
        """
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                if optim_level == 1:
                    with amp.disable_casts():
                        t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                            t_audio_signal_e, t_a_sig_length_e)
                else:
                    t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                        t_audio_signal_e, t_a_sig_length_e)
                if jasper_encoder.use_conv_mask:
                    t_log_probs_e, t_encoded_len_e = model.forward(
                        (t_processed_signal_e, t_processed_sig_length_e))
                else:
                    # without the conv mask the model returns only log probs;
                    # the CTC loss below still needs an encoded length, so this
                    # path assumes the encoder preserves the time dimension
                    t_log_probs_e = model.forward(t_processed_signal_e)
                    t_encoded_len_e = t_processed_sig_length_e
                t_loss_e = ctc_loss(log_probs=t_log_probs_e,
                                    targets=t_transcript_e,
                                    input_length=t_encoded_len_e,
                                    target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(loss=[t_loss_e],
                                   predictions=[t_predictions_e],
                                   transcript=[t_transcript_e],
                                   transcript_length=[t_transcript_len_e])
                process_evaluation_batch(values_dict,
                                         _global_var_dict,
                                         labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)

            if name != '':
                name = '_' + name

            print_once(f"==========>>>>>>Evaluation{name} Loss: {eloss}\n")
            print_once(f"==========>>>>>>Evaluation{name} WER: {wer}\n")

    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    step = epoch * args.step_per_epoch

    audio_preprocessor = model.module.audio_preprocessor if hasattr(
        model, 'module') else model.audio_preprocessor
    data_spectr_augmentation = model.module.data_spectr_augmentation if hasattr(
        model, 'module') else model.data_spectr_augmentation
    jasper_encoder = model.module.jasper_encoder if hasattr(
        model, 'module') else model.jasper_encoder

    while True:
        if multi_gpu:
            data_layer.sampler.set_epoch(epoch)
        print_once("Starting epoch {0}, step {1}".format(epoch, step))
        last_epoch_start = time.time()
        batch_counter = 0
        average_loss = 0
        for data in train_dataloader:
            tensors = []
            for d in data:
                if isinstance(d, torch.Tensor):
                    tensors.append(d.cuda())
                else:
                    tensors.append(d)

            if batch_counter == 0:

                if fn_lr_policy is not None:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                optimizer.zero_grad()
                last_iter_start = time.time()

            t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = tensors
            model.train()
            if optim_level == 1:
                with amp.disable_casts():
                    t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(
                        t_audio_signal_t, t_a_sig_length_t)
            else:
                t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(
                    t_audio_signal_t, t_a_sig_length_t)
            t_processed_signal_t = data_spectr_augmentation(
                t_processed_signal_t)
            if jasper_encoder.use_conv_mask:
                t_log_probs_t, t_encoded_len_t = model.forward(
                    (t_processed_signal_t, t_processed_sig_length_t))
            else:
                # without the conv mask the model returns only log probs; the
                # CTC loss below still needs an encoded length, so this path
                # assumes the encoder preserves the time dimension
                t_log_probs_t = model.forward(t_processed_signal_t)
                t_encoded_len_t = t_processed_sig_length_t

            t_loss_t = ctc_loss(log_probs=t_log_probs_t,
                                targets=t_transcript_t,
                                input_length=t_encoded_len_t,
                                target_length=t_transcript_len_t)
            if args.gradient_accumulation_steps > 1:
                t_loss_t = t_loss_t / args.gradient_accumulation_steps

            if 0 < optim_level <= 3:
                with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                t_loss_t.backward()
            batch_counter += 1
            average_loss += t_loss_t.item()

            if batch_counter % args.gradient_accumulation_steps == 0:
                optimizer.step()

                if step % args.train_frequency == 0:
                    t_predictions_t = greedy_decoder(log_probs=t_log_probs_t)

                    e_tensors = [
                        t_predictions_t, t_transcript_t, t_transcript_len_t
                    ]
                    train_wer = monitor_asr_train_progress(e_tensors,
                                                           labels=labels)
                    print_once("Loss@Step: {0}  ::::::: {1}".format(
                        step, str(average_loss)))
                    print_once(
                        "Step time: {0} seconds".format(time.time() -
                                                        last_iter_start))
                if step > 0 and step % args.eval_frequency == 0:
                    print_once(
                        "Doing Evaluation ....................... ......  ... .. . ."
                    )
                    eval(model)
                    if args.ema > 0:
                        eval(ema_model, 'EMA')

                step += 1
                batch_counter = 0
                average_loss = 0
                if args.num_steps is not None and step >= args.num_steps:
                    break

        if args.num_steps is not None and step >= args.num_steps:
            break
        print_once("Finished epoch {0} in {1}".format(
            epoch,
            time.time() - last_epoch_start))
        epoch += 1
        if epoch % args.save_frequency == 0 and epoch > 0:
            save(model, ema_model, optimizer, epoch, args.output_dir,
                 optim_level)
        if args.num_steps is None and epoch >= args.num_epochs:
            break
    print_once("Done in {0}".format(time.time() - start_time))
    print_once("Final Evaluation ....................... ......  ... .. . .")
    eval(model)
    if args.ema > 0:
        eval(ema_model, 'EMA')
    save(model, ema_model, optimizer, epoch, args.output_dir, optim_level)
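
The ema_model evaluated and saved above holds an exponential moving average of the training weights, with args.ema as the decay factor. The update rule is not shown in these examples; it is typically one in-place line per parameter, e.g. (a minimal sketch, not this project's implementation):

import torch

@torch.no_grad()
def update_ema(model, ema_model, decay):
    # ema = decay * ema + (1 - decay) * current weights, applied after each step
    for p_ema, p in zip(ema_model.parameters(), model.parameters()):
        p_ema.mul_(decay).add_(p, alpha=1.0 - decay)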