Example #1
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         multi_gpu, device, args):
    """performs inference / evaluation
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        device: torch device to run inference on
        args: script input arguments
    """
    logits_save_to = args.logits_save_to

    with torch.no_grad():
        if args.wav:
            audio, audio_len = audio_from_file(args.wav)
            run_once(audio_processor, encoderdecoder, greedy_decoder, audio,
                     audio_len, labels, device)
            if args.export_model:
                jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(
                    audio, audio_len, audio_processor, encoderdecoder,
                    greedy_decoder, args)
                run_once(jit_audio_processor, jit_encoderdecoder,
                         jit_greedy_decoder, audio, audio_len, labels, device)
            return
        wer, _global_var_dict = calc_wer(data_layer, audio_processor,
                                         encoderdecoder, greedy_decoder,
                                         labels, args, device)
        if (not multi_gpu
                or (multi_gpu and torch.distributed.get_rank() == 0)):
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))

            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
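
The jit_export helper used above is defined elsewhere in the script; it presumably wraps TorchScript export of the preprocessor, acoustic model and decoder. A minimal, self-contained sketch of TorchScript tracing, with a tiny stand-in module and dummy shapes as assumptions:

# Illustrative TorchScript tracing sketch; the real jit_export traces the actual
# audio processor, encoder/decoder and greedy decoder used above.
import torch
import torch.nn as nn

class TinyAcousticModel(nn.Module):
    def __init__(self, feat_in=64, num_classes=29):
        super().__init__()
        self.proj = nn.Linear(feat_in, num_classes)

    def forward(self, feats):
        # feats: (batch, time, feat_in) -> log-probabilities over the vocabulary
        return torch.log_softmax(self.proj(feats), dim=-1)

model = TinyAcousticModel().eval()
example = torch.randn(1, 100, 64)           # dummy (batch, time, features) input
traced = torch.jit.trace(model, example)    # record the graph for this input shape
with torch.no_grad():
    log_probs = traced(example)
print(log_probs.shape)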
Example #2
def get_pytorch_components_and_onnx(args):
    '''Returns PyTorch components used for inference
    '''
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    # Set up global labels for future vocab calls
    global _global_ctc_labels
    _global_ctc_labels = add_ctc_labels(dataset_vocab)
    featurizer_config = model_definition['input_eval']

    optim_level = 3 if args.pyt_fp16 else 0

    featurizer_config["optimization_level"] = optim_level

    audio_preprocessor = None
    onnx_path = None
    data_layer = None
    wav = None
    seq_len = None

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(dataset_dir=args.dataset_dir,
                                          featurizer_config=featurizer_config,
                                          manifest_filepath=args.val_manifest,
                                          labels=dataset_vocab,
                                          batch_size=args.batch_size,
                                          shuffle=False)
    if args.wav is not None:
        args.batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            args.seq_len = seq_len / (featurizer_config['sample_rate'] / 100)

    if args.transpose:
        featurizer_config["transpose_out"] = True
        model_definition["transpose_in"] = True

    model = JasperEncoderDecoder(jasper_model_definition=model_definition,
                                 feat_in=1024,
                                 num_classes=len(get_vocab()),
                                 transpose_in=args.transpose)
    model = model.cuda()
    model.eval()

    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    audio_preprocessor = audio_preprocessor.cuda()
    audio_preprocessor.eval()

    if args.ckpt_path is not None:
        if os.path.isdir(args.ckpt_path):
            d_checkpoint = torch.load(args.ckpt_path + "/decoder.pt",
                                      map_location="cpu")
            e_checkpoint = torch.load(args.ckpt_path + "/encoder.pt",
                                      map_location="cpu")
            model.jasper_encoder.load_state_dict(e_checkpoint, strict=False)
            model.jasper_decoder.load_state_dict(d_checkpoint, strict=False)
        else:
            checkpoint = torch.load(args.ckpt_path, map_location="cpu")
            model.load_state_dict(checkpoint['state_dict'], strict=False)

    # if we are to produce engine, not run/create ONNX, postpone AMP initialization
    # (ONNX parser cannot handle mixed FP16 ONNX yet)
    if args.pyt_fp16 and args.engine_path is None:
        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])

    if args.make_onnx:
        if args.onnx_path is None or args.ckpt_path is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx"
            )
        onnx_path = get_onnx(args.onnx_path, model, args)

    if args.pyt_fp16 and args.engine_path is not None:
        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])

    return {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': model,
        'input_wav': (wav, seq_len)
    }, onnx_path
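
add_ctc_labels is imported from elsewhere; for CTC models it conventionally just appends a blank symbol to the dataset vocabulary. A hedged sketch of that behaviour (the blank symbol and its position are assumptions):

# Assumed behaviour of add_ctc_labels: append the CTC blank to the dataset vocabulary.
def add_ctc_labels(labels, blank="_"):
    # CTC needs one extra output class for the blank token; appending it last
    # matches the common convention of using index len(labels) as the blank id.
    return list(labels) + [blank]

vocab = add_ctc_labels([" ", "a", "b", "c"])
print(len(vocab), vocab[-1])  # 5 '_'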
Example #3
def eval(
        data_layer,
        audio_processor,
        encoderdecoder,
        greedy_decoder,
        labels,
        multi_gpu,
        args):
    """performs inference / evaluation
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        args: script input arguments
    """
    if args.ipex:
        import intel_extension_for_pytorch as ipex

    logits_save_to = args.logits_save_to
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits' : [],
        }

        if args.wav:
            # TODO: .wav input is not implemented for the ipex path yet
            raise NotImplementedError("wav input is unsupported with ipex for now")
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            # torch.cuda.synchronize()
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            # torch.cuda.synchronize()
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return
        steps_per_epoch = len(data_layer)
        total_steps = args.steps if args.steps is not None else steps_per_epoch
        test_epoches = int(total_steps / steps_per_epoch)
        print('Evaluating RNNT: Steps per Epoch {} total Steps {}'.format(steps_per_epoch, total_steps))

        # Int8 Calibration
        if args.ipex and args.int8 and args.calibration:
            print("runing int8 calibration step\n")
            conf = ipex.AmpConf(torch.int8)            
            for it, data in enumerate(tqdm(data_layer.data_iterator)):
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                
                t_predictions_e, conf = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)

                if args.steps is not None and it + 1 >= args.steps:
                    break
            conf.save(args.configure_dir)
        # Inference (vanilla cpu, dnnl fp32 or dnnl int8)
        else:
            if not args.ipex:
                if args.warm_up > 0:
                    print("\nstart warm up, warmp_up steps = ", args.warm_up)
                    for it, data in enumerate(tqdm(data_layer.data_iterator)):
                            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                            conf = None
                            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                            
                            if it + 1 >= args.warm_up:
                                break
                print("\nstart measure performance, measure steps = ", total_steps)
                total_time = 0
                with tqdm(total=total_steps) as pbar:
                    for epoch in range(test_epoches + 1):
                        for it, data in enumerate(data_layer.data_iterator):
                            if  epoch * steps_per_epoch + it >= total_steps:
                                break
                            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                            if args.profiling:
                                # with torch.autograd.profiler.profile(args.profiling) as prof:
                                with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                    conf = None
                                    t0 = time.perf_counter()
                                    t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                    t1 = time.perf_counter()
                            else:
                                conf = None
                                t0 = time.perf_counter()
                                t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                t1 = time.perf_counter()

                            total_time += (t1 - t0)

                            values_dict = dict(
                                predictions=[t_predictions_e],
                                transcript=[t_transcript_e],
                                transcript_length=[t_transcript_len_e],
                            )
                            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                            pbar.update(1)
            else:
                if args.mix_precision:
                    with torch.cpu.amp.autocast():
                        # warm up
                        if args.warm_up > 0:
                            print("\nstart warm up, warmp_up steps = ", args.warm_up)
                            for it, data in enumerate(tqdm(data_layer.data_iterator)):
                                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                                conf = None
                                t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                
                                if it + 1 >= args.warm_up:
                                    break

                        # measure performance
                        print("\nstart measure performance, measure steps = ", total_steps)
                        total_time = 0
                        # with torch.autograd.profiler.profile(args.profiling) as prof:
                        with tqdm(total=total_steps) as pbar:
                            for epoch in range(test_epoches + 1):
                                for it, data in enumerate(data_layer.data_iterator):
                                    if epoch * steps_per_epoch + it >= total_steps:
                                        break
                                    t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                                    if args.profiling:
                                        # with torch.autograd.profiler.profile(args.profiling) as prof:
                                        with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                            conf = None
                                            t0 = time.perf_counter()
                                            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                            t1 = time.perf_counter()
                                    else:
                                        conf = None
                                        t0 = time.perf_counter()
                                        t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                        t1 = time.perf_counter()

                                    total_time += (t1 - t0)

                                    values_dict = dict(
                                        predictions=[t_predictions_e],
                                        transcript=[t_transcript_e],
                                        transcript_length=[t_transcript_len_e],
                                    )
                                    process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                                    pbar.update(1)
                else:
                    # warm up
                    if args.warm_up > 0:
                        print("\nstart warm up, warmp_up steps = ", args.warm_up)
                        for it, data in enumerate(tqdm(data_layer.data_iterator)):
                            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                            conf = None
                            t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                            
                            if it + 1 >= args.warm_up:
                                break

                    # measure performance
                    print("\nstart measure performance, measure steps = ", total_steps)
                    total_time = 0
                    # with torch.autograd.profiler.profile(args.profiling) as prof:
                    with tqdm(total=total_steps) as pbar:
                        for epoch in range(test_epoches + 1):
                            for it, data in enumerate(data_layer.data_iterator):
                                if epoch * steps_per_epoch + it >= total_steps:
                                    break
                                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = audio_processor(data)
                                if args.profiling:
                                    # with torch.autograd.profiler.profile(args.profiling) as prof:
                                    with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                                        conf = None
                                        t0 = time.perf_counter()
                                        t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                        t1 = time.perf_counter()
                                else:
                                    conf = None
                                    t0 = time.perf_counter()
                                    t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e, args, conf)
                                    t1 = time.perf_counter()

                                total_time += (t1 - t0)

                                values_dict = dict(
                                    predictions=[t_predictions_e],
                                    transcript=[t_transcript_e],
                                    transcript_length=[t_transcript_len_e],
                                )
                                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

                                pbar.update(1)

            if args.print_result:
                hypotheses = _global_var_dict['predictions']
                references = _global_var_dict['transcripts']

                nb = len(hypotheses)
                print("print %d sample results: " % nb)
                for i in range(nb):
                    print("hyp: ", hypotheses[i])
                    print("ref: ", references[i])
                    print()
            
            if args.profiling:
                # print(prof.key_averages().table(sort_by="cpu_time_total"))
                print(prof.key_averages().table(sort_by="self_cpu_time_total"))

            wer, _ = process_evaluation_epoch(_global_var_dict)
            if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
                print("\n=========================>>>>>>")
                print("Evaluation WER: {0}".format(wer))
                print("Accuracy: {:.15f} ".format(1 - wer))
                if args.save_prediction is not None:
                    with open(args.save_prediction, 'w') as fp:
                        fp.write('\n'.join(_global_var_dict['predictions']))
                if logits_save_to is not None:
                    logits = []
                    for batch in _global_var_dict["logits"]:
                        for i in range(batch.shape[0]):
                            logits.append(batch[i].cpu().numpy())
                    with open(logits_save_to, 'wb') as f:
                        pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)

            if args.steps:
                total_samples = args.steps * args.batch_size
            else:
                total_samples = len(data_layer)

            print("total samples tested: ", total_samples)
            print("total time (encoder + decoder, excluded audio processing): ", total_time, "s")
            print("dataset size: ", len(data_layer))

            perf = total_samples / total_time

            print("Throughput: {:.3f} fps".format(perf))
Example #4
def eval(
        data_layer,
        audio_processor,
        encoderdecoder,
        greedy_decoder,
        labels,
        multi_gpu,
        args):
    """performs inference / evaluation
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        multi_gpu: true if using multiple gpus
        args: script input arguments
    """
    logits_save_to = args.logits_save_to
    audio_processor.eval()
    encoderdecoder.eval()
    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
            'logits' : [],
        }

        if args.wav:
            features, p_length_e = audio_processor(audio_from_file(args.wav))
            torch.cuda.synchronize()
            t0 = time.perf_counter()
            t_log_probs_e = encoderdecoder(features)
            torch.cuda.synchronize()
            t1 = time.perf_counter()
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
            print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0))
            print("TRANSCRIPT\t\t:", hypotheses[0])
            return
        
        for it, data in enumerate(tqdm(data_layer.data_iterator)):
            tensors = []
            for d in data:
                tensors.append(d.cuda())

            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

            inp = (t_audio_signal_e, t_a_sig_length_e)

            t_processed_signal, p_length_e = audio_processor(x=inp)
            if args.use_conv_mask:
                t_log_probs_e, t_encoded_len_e  = encoderdecoder((t_processed_signal, p_length_e))
            else:
                t_log_probs_e  = encoderdecoder(t_processed_signal)
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

            values_dict = dict(
                predictions=[t_predictions_e],
                transcript=[t_transcript_e],
                transcript_length=[t_transcript_len_e],
                output=[t_log_probs_e]
            )
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            if args.steps is not None and it + 1 >= args.steps:
                break
        wer, _ = process_evaluation_epoch(_global_var_dict)
        if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
            print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
            if args.save_prediction is not None:
                with open(args.save_prediction, 'w') as fp:
                    fp.write('\n'.join(_global_var_dict['predictions']))
            if logits_save_to is not None:
                logits = []
                for batch in _global_var_dict["logits"]:
                    for i in range(batch.shape[0]):
                        logits.append(batch[i].cpu().numpy())
                with open(logits_save_to, 'wb') as f:
                    pickle.dump(logits, f, protocol=pickle.HIGHEST_PROTOCOL)
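
__ctc_decoder_predictions_tensor is not reproduced in this example; greedy CTC decoding conventionally collapses repeated indices and drops the blank. A short sketch under that assumption (label set and blank id are illustrative):

# Sketch of greedy CTC decoding: collapse repeats, then drop the blank symbol.
import torch

def greedy_ctc_decode(predictions, labels, blank_id):
    # predictions: (batch, time) tensor of argmax label indices
    hypotheses = []
    for seq in predictions:
        decoded, prev = [], None
        for idx in seq.tolist():
            if idx != prev and idx != blank_id:   # skip repeats and blanks
                decoded.append(labels[idx])
            prev = idx
        hypotheses.append("".join(decoded))
    return hypotheses

labels = [" ", "a", "b", "c", "_"]                # "_" as the blank (assumption)
preds = torch.tensor([[1, 1, 4, 2, 2, 0, 3]])
print(greedy_ctc_decode(preds, labels, blank_id=4))  # ['ab c']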
Example #5
def get_pytorch_components_and_onnx(args):
    '''Returns PyTorch components used for inference
    '''
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    # Set up global labels for future vocab calls
    global _global_ctc_labels
    _global_ctc_labels = add_ctc_labels(dataset_vocab)
    featurizer_config = model_definition['input_eval']

    optim_level = Optimization.mxprO3 if args.pyt_fp16 else Optimization.mxprO0

    featurizer_config["optimization_level"] = optim_level
    acoustic_model = None
    audio_preprocessor = None
    onnx_path = None
    data_layer = None
    wav = None
    seq_len = None
    dtype = torch.float

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(dataset_dir=args.dataset_dir,
                                          featurizer_config=featurizer_config,
                                          manifest_filepath=args.val_manifest,
                                          labels=dataset_vocab,
                                          batch_size=args.batch_size,
                                          shuffle=False)
    if args.wav is not None:
        args.batch_size = 1
        args.engine_batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            args.seq_len = seq_len / (featurizer_config['sample_rate'] / 100)

    model = Jasper(feature_config=featurizer_config,
                   jasper_model_definition=model_definition,
                   feat_in=1024,
                   num_classes=len(get_vocab()))

    model.cuda()
    model.eval()
    acoustic_model = model.acoustic_model
    audio_preprocessor = model.audio_preprocessor

    if args.ckpt_path is not None:
        checkpoint = torch.load(args.ckpt_path, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if args.make_onnx:
        if args.onnx_path is None or acoustic_model is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx"
            )
        onnx_path = get_onnx(args.onnx_path,
                             acoustic_model,
                             signal_shape=(args.engine_batch_size, 64,
                                           args.seq_len),
                             dtype=torch.float)

    if args.pyt_fp16:
        amp.initialize(models=acoustic_model,
                       opt_level=AmpOptimizations[optim_level])

    return {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': acoustic_model,
        'input_wav': (wav, seq_len)
    }, onnx_path
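
get_onnx is defined elsewhere; exporting an acoustic model to ONNX is typically done with torch.onnx.export, roughly as in this sketch (the stand-in module, shapes and file name are assumptions):

# Sketch of exporting an acoustic model to ONNX (what a helper like get_onnx typically does).
import torch

model = torch.nn.Sequential(torch.nn.Conv1d(64, 29, kernel_size=1)).eval()
dummy = torch.randn(1, 64, 256)                  # dummy (batch, features, time) signal
torch.onnx.export(model, dummy, "acoustic_model.onnx",
                  input_names=["feats"], output_names=["logits"],
                  dynamic_axes={"feats": {0: "batch", 2: "time"},
                                "logits": {0: "batch", 2: "time"}})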
def eval(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels,
         args):
    """performs evaluation and prints performance statistics
    Args:
        data_layer: data layer object that holds data loader
        audio_processor: data processing module
        encoderdecoder: acoustic model
        greedy_decoder: greedy decoder
        labels: list of labels as output vocabulary
        args: script input arguments
    """
    batch_size = args.batch_size
    steps = args.steps
    audio_processor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    # TORCHSCRIPT
    if args.cpu_run:
        audio, audio_len = audio_from_file(args.sample_audio, cpu_run=True)
        jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(
            audio, audio_len, audio_processor, encoderdecoder, greedy_decoder,
            args)

    with torch.no_grad():
        _global_var_dict = {
            'predictions': [],
            'transcripts': [],
        }

        it = 0
        ep = 0

        if steps is None:
            steps = math.ceil(len(data_layer) / batch_size)
        durations_dnn = []
        durations_dnn_and_prep = []
        seq_lens = []
        while True:
            ep += 1
            for data in tqdm(data_layer.data_iterator):
                it += 1
                if it > steps:
                    break
                tensors = []
                dl_device = torch.device(
                    "cpu") if args.cpu_run else torch.device("cuda")
                for d in data:
                    tensors.append(d.to(dl_device))

                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
                if not args.cpu_run:
                    torch.cuda.synchronize()
                    t0 = time.perf_counter()
                    t_processed_signal = audio_processor(
                        t_audio_signal_e, t_a_sig_length_e)
                    torch.cuda.synchronize()
                    t1 = time.perf_counter()
                    t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
                    torch.cuda.synchronize()
                    stop_time = time.perf_counter()
                    time_prep_and_dnn = stop_time - t0
                    time_dnn = stop_time - t1
                    t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
                if args.cpu_run:
                    t0 = time.perf_counter()
                    t_processed_signal, _ = jit_audio_processor(
                        t_audio_signal_e, t_a_sig_length_e)
                    t1 = time.perf_counter()
                    t_log_probs_e, _ = jit_encoderdecoder(t_processed_signal)
                    stop_time = time.perf_counter()
                    time_prep_and_dnn = stop_time - t0
                    time_dnn = stop_time - t1
                    t_predictions_e = jit_greedy_decoder(
                        log_probs=t_log_probs_e)

                values_dict = dict(
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                )
                process_evaluation_batch(values_dict,
                                         _global_var_dict,
                                         labels=labels)
                durations_dnn.append(time_dnn)
                durations_dnn_and_prep.append(time_prep_and_dnn)
                seq_lens.append(t_processed_signal[0].shape[-1])

            if it >= steps:

                wer, _ = process_evaluation_epoch(_global_var_dict)
                print(
                    "==========>>>>>>Evaluation of all iterations WER: {0}\n".
                    format(wer))
                break

        ratios = [0.9, 0.95, 0.99, 1.]
        latencies_dnn = take_durations_and_output_percentile(
            durations_dnn, ratios)
        latencies_dnn_and_prep = take_durations_and_output_percentile(
            durations_dnn_and_prep, ratios)
        print("\n using batch size {} and {} frames ".format(
            batch_size, seq_lens[-1]))
        print("\n".join([
            "dnn latency {} : {} ".format(k, v)
            for k, v in latencies_dnn.items()
        ]))
        print("\n".join([
            "prep + dnn latency {} : {} ".format(k, v)
            for k, v in latencies_dnn_and_prep.items()
        ]))
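
take_durations_and_output_percentile is not shown above; mapping the collected durations to latency percentiles could look like the following sketch (the helper's exact semantics are an assumption):

# Assumed sketch of take_durations_and_output_percentile: map ratios to latency percentiles.
import numpy as np

def take_durations_and_output_percentile(durations, ratios):
    durations = np.sort(np.asarray(durations))
    return {r: float(np.percentile(durations, r * 100)) for r in ratios}

latencies = take_durations_and_output_percentile(
    [0.012, 0.015, 0.011, 0.013, 0.030], [0.9, 0.95, 0.99, 1.0])
for k, v in latencies.items():
    print("dnn latency {} : {} ".format(k, v))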