Exemplo n.º 1
0
    def test_eff_save_restore_from_nemo_file_encrypted(self, asr_model):
        """Check that an encrypted save/restore round trip keeps the weight count.

        The model is saved while an encryption key is set. Restoring without
        the key is expected to raise ``PermissionError``; restoring with the
        key must yield a model with the same ``num_weights``.
        """
        with tempfile.NamedTemporaryFile() as fp:
            filename = fp.name

            try:
                # Set key - use checkpoint encryption.
                NeMoArchive.set_encryption_key("test_key")

                # Save model (with random artifact).
                with tempfile.NamedTemporaryFile() as artifact:
                    asr_model.register_artifact(config_path=None, src=artifact.name)
                    asr_model.save_to(save_path=filename)

                # Try to restore the encrypted archive (weights) without the encryption key.
                NeMoArchive.set_encryption_key(None)
                with pytest.raises(PermissionError):
                    EncDecCTCModel.restore_from(restore_path=filename)

                # Restore the model with the proper key.
                NeMoArchive.set_encryption_key("test_key")
                asr_model3 = EncDecCTCModel.restore_from(restore_path=filename)
            finally:
                # The key is set through a class-level call, so reset it even
                # when a step above fails; otherwise it would leak into other
                # save/restore tests.
                NeMoArchive.set_encryption_key(None)

            assert asr_model.num_weights == asr_model3.num_weights
Exemplo n.º 2
0
def oth_quartznet15x5_en_nr(pretrained=False, num_classes=29, **kwargs):
    """Restore the "QuartzNet15x5NR-En" .nemo checkpoint and wrap it in ``QuartzNet`` on CPU.

    ``pretrained`` and ``**kwargs`` are accepted but not used here.

    Returns:
        The ``QuartzNet`` wrapper after ``.cpu()``.
    """
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_path = path_pref + "QuartzNet15x5NR-En_b05e34f3.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_path)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
Exemplo n.º 3
0
def oth_quartznet15x5_ru(pretrained=False, num_classes=35, **kwargs):
    """Restore the "stt_ru_quartznet15x5" .nemo checkpoint and wrap it in ``QuartzNet`` on CPU.

    ``pretrained`` and ``**kwargs`` are accepted but not used here.

    Returns:
        The ``QuartzNet`` wrapper after ``.cpu()``.
    """
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_path = path_pref + "stt_ru_quartznet15x5_88a3e5aa.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_path)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
Exemplo n.º 4
0
def oth_quartznet15x5_ru34(pretrained=False, num_classes=34, **kwargs):
    """Restore the "QuartzNet15x5_golos" .nemo checkpoint and wrap it in ``QuartzNet`` on CPU.

    ``pretrained`` and ``**kwargs`` are accepted but not used here.

    Returns:
        The ``QuartzNet`` wrapper after ``.cpu()`` (the raw NeMo model is not returned).
    """
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_path = path_pref + "QuartzNet15x5_golos_1a63a2d8.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_path)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
Exemplo n.º 5
0
    def test_save_restore_from_nemo_file(self, asr_model):
        """Verify that a model restored from a saved archive matches the original.

        Compares vocabulary length, ``num_weights`` and the weights of the
        first encoder convolution between the saved and the restored model.
        """
        with tempfile.NamedTemporaryFile() as archive_fp:
            archive_path = archive_fp.name

            # Register a throw-away artifact and save while it still exists.
            with tempfile.NamedTemporaryFile() as artifact_fp:
                asr_model.register_artifact(config_path=None, src=artifact_fp.name)
                asr_model.save_to(save_path=archive_path)

            restored = EncDecCTCModel.restore_from(restore_path=archive_path)

            original_vocab_size = len(asr_model.decoder.vocabulary)
            restored_vocab_size = len(restored.decoder.vocabulary)
            assert original_vocab_size == restored_vocab_size
            assert asr_model.num_weights == restored.num_weights

            # Same extraction for both models, original first.
            original_weights, restored_weights = (
                model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()
                for model in (asr_model, restored)
            )

            assert np.array_equal(original_weights, restored_weights)
Exemplo n.º 6
0
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type="asr",
):
    """Export a .nemo checkpoint to ONNX and bundle config + graph into a tar archive.

    Args:
        nemo_file: path of the source .nemo archive to restore the model from.
        enemo_file: path of the tar archive to create.
        onnx_file: path where the exported ONNX graph is written.
        model_type: one of "asr", "speech_label" or "speaker"; selects the
            model class used to restore ``nemo_file``.

    Raises:
        NameError: if ``model_type`` is not one of the supported names.
    """
    if model_type == "asr":
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == "speech_label":
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == "speaker":
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, "r") as archive:
        # The config is extracted into the current working directory and
        # added to the new archive under the same relative name.
        archive.extract("./model_config.yaml")
        with tarfile.open(enemo_file, "w") as enemo_archive:
            enemo_archive.add("./model_config.yaml")
            # Store the exported graph under a fixed member name directly from
            # its real location. Copying it to "model_graph.onnx" in the cwd
            # first would clobber (and then delete) any file of that name and
            # fails outright when onnx_file already is that path.
            enemo_archive.add(onnx_file, arcname="model_graph.onnx")
Exemplo n.º 7
0
def oth_jasperdr10x5_en_nr(pretrained=False, num_classes=29, **kwargs):
    """Restore the "stt_en_jasper10x5dr" .nemo checkpoint and wrap it in ``QuartzNet`` on CPU.

    ``pretrained`` and ``**kwargs`` are accepted but not used here.

    Returns:
        The ``QuartzNet`` wrapper after ``.cpu()`` (the raw NeMo model is not returned).
    """
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_path = path_pref + "stt_en_jasper10x5dr_0d5ebc6c.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_path)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
Exemplo n.º 8
0
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type='asr',
):
    """Export a .nemo checkpoint to ONNX and bundle config + graph into a tar archive.

    Args:
        nemo_file: path of the source .nemo archive to restore the model from.
        enemo_file: path of the tar archive to create.
        onnx_file: path where the exported ONNX graph is written.
        model_type: one of 'asr', 'speech_label' or 'speaker'; selects the
            model class used to restore ``nemo_file``.

    Raises:
        NameError: if ``model_type`` is not one of the supported names.
    """
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, 'r') as archive:
        # The config is extracted into the current working directory and
        # added to the new archive under the same relative name.
        archive.extract('./model_config.yaml')
        with tarfile.open(enemo_file, 'w') as enemo_archive:
            enemo_archive.add('./model_config.yaml')
            # add() reads size and metadata from the file on disk. A bare
            # TarInfo("model_graph.onnx") has size 0, so addfile() with it
            # would store an empty member (and the text-mode handle passed to
            # it was never closed).
            enemo_archive.add(onnx_file, arcname="model_graph.onnx")
Exemplo n.º 9
0
    def test_save_model_level_pt_ckpt(self, asr_model):
        """Verify that a state dict extracted from a .nemo file can restore weights.

        Saves the model, extracts ``model_weights.ckpt`` from the archive,
        perturbs the restored model's first encoder convolution and then checks
        that loading the extracted checkpoint brings the weights back.
        """

        def first_conv_weights(model):
            # NumPy copy of the first encoder block's first conv weights.
            return model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()

        with tempfile.TemporaryDirectory() as workdir:
            archive_path = os.path.join(workdir, 'asr.nemo')
            asr_model.save_to(archive_path)

            # Save model level PT checkpoint
            asr_model.extract_state_dict_from(archive_path, workdir)
            weights_path = os.path.join(workdir, 'model_weights.ckpt')
            assert os.path.exists(weights_path)

            # Restore the model.
            restored = EncDecCTCModel.restore_from(restore_path=archive_path)
            assert len(asr_model.decoder.vocabulary) == len(restored.decoder.vocabulary)
            assert asr_model.num_weights == restored.num_weights

            # Change weights values so the two models are known to differ.
            restored.encoder.encoder[0].mconv[0].conv.weight.data += 1.0
            reference = first_conv_weights(asr_model)
            perturbed = first_conv_weights(restored)
            assert not np.array_equal(reference, perturbed)

            # Restore from checkpoint and expect the original weights again.
            restored.load_state_dict(torch.load(weights_path))
            reference = first_conv_weights(asr_model)
            reloaded = first_conv_weights(restored)
            assert np.array_equal(reference, reloaded)
Exemplo n.º 10
0
def oth_jasperdr10x5_en(pretrained=False, num_classes=29, **kwargs):
    """Restore the "Jasper10x5Dr-En" .nemo checkpoint and wrap it in ``QuartzNet`` on CPU.

    ``pretrained`` and ``**kwargs`` are accepted but not used here.

    Returns:
        The ``QuartzNet`` wrapper after ``.cpu()``.
    """
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_path = path_pref + "Jasper10x5Dr-En_2b94c9d1.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_path)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
Exemplo n.º 11
0
def main():
    """Evaluate a CTC ASR model on a manifest and check its WER against a tolerance.

    Command-line arguments select the model (a local ``.nemo`` file or a
    pretrained model name), the evaluation manifest, the batch size, the WER
    tolerance and whether transcripts are normalized.

    Raises:
        ValueError: if the measured WER is higher than ``--wer_tolerance``.
    """

    def _str2bool(value):
        # ``type=bool`` would turn every non-empty string (including "False")
        # into True, so parse the textual value explicitly instead.
        lowered = value.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise ValueError(f"expected a boolean value, got {value!r}")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=_str2bool,
        help="Normalize transcripts or not. Set to False for non-English.",
    )
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)
    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
        }
    )
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Map label index -> symbol, used to turn reference token ids back into text.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    hypotheses = []
    references = []
    for test_batch in asr_model.test_dataloader():
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1]
            )
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        for batch_ind in range(greedy_predictions.shape[0]):
            token_ids = test_batch[2][batch_ind].cpu().detach().numpy()
            references.append(''.join(labels_map[c] for c in token_ids))
        del test_batch
    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    if wer_value > args.wer_tolerance:
        raise ValueError(f"Got WER of {wer_value}. It was higher than {args.wer_tolerance}")
    logging.info(f'Got WER of {wer_value}. Tolerance was {args.wer_tolerance}')
Exemplo n.º 12
0
    def test_save_restore_from_nemo_file_with_override(self, asr_model, tmpdir):
        """Check restoring from a .nemo file with an overriding config file.

        The restored model must keep the vocabulary size, ``num_weights`` and
        first encoder convolution weights of the saved model, while picking up
        the ``activation`` value written to the override config.

        Args:
            tmpdir: fixture providing a temporary directory unique to the test invocation.
        """
        # Name of the archive in tmp folder.
        filename = os.path.join(tmpdir, "eff.nemo")

        with tempfile.NamedTemporaryFile(mode='a+') as conf_fp:

            # Create a "random artifact". delete=False keeps it on disk after
            # the handle is closed so it can be registered and saved below.
            with tempfile.NamedTemporaryFile(mode="w", delete=False) as artifact:
                artifact.write("magic content 42")

            try:
                # Add artifact to model.
                asr_model.register_artifact(config_path=None, src=artifact.name)
                # Save model (with "random artifact").
                asr_model.save_to(save_path=filename)

                # Modify config slightly
                cfg = asr_model.cfg
                cfg.encoder.params.activation = 'swish'
                yaml_cfg = OmegaConf.to_yaml(cfg)
                conf_fp.write(yaml_cfg)
                # seek() also flushes the buffered write so the file can be read by name.
                conf_fp.seek(0)

                # Restore the model.
                asr_model2 = EncDecCTCModel.restore_from(restore_path=filename, override_config_path=conf_fp.name)

                assert len(asr_model.decoder.vocabulary) == len(asr_model2.decoder.vocabulary)
                assert asr_model.num_weights == asr_model2.num_weights

                w1 = asr_model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()
                w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()

                assert np.array_equal(w1, w2)

                assert asr_model2.cfg.encoder.params.activation == 'swish'
            finally:
                # The artifact was created with delete=False, so nothing else
                # removes it; clean it up to avoid leaking temp files.
                try:
                    os.remove(artifact.name)
                except FileNotFoundError:
                    pass
0
def infer(model, audiofiles, batch_size=4):
    """Transcribe audio files with a restored CTC model and collect log-probabilities.

    Args:
        model: path passed to ``EncDecCTCModel.restore_from``.
        audiofiles: audio file paths to transcribe.
        batch_size: batch size handed to the transcription dataloader config.

    Returns:
        A tuple ``(characters, log_probs, vocab)``: the decoded predictions,
        one list of log-probability rows per utterance (cut to the encoded
        length), and the labels read from the model's training config.
    """
    asr_model = EncDecCTCModel.restore_from(model)

    # Remember the train/eval flag so it can be put back after inference.
    was_training = asr_model.training
    device = next(asr_model.parameters()).device
    asr_model.eval()
    vocab = asr_model._cfg.train_ds.labels

    characters = []
    log_probs = []
    with tempfile.TemporaryDirectory() as tmpdir:
        manifest_path = os.path.join(tmpdir, 'manifest.json')
        with open(manifest_path, 'w') as manifest:
            for audio_path in audiofiles:
                record = {'audio_filepath': audio_path, 'duration': 100000, 'text': 'nothing'}
                manifest.write(json.dumps(record) + '\n')

        loader_config = {'paths2audio_files': audiofiles, 'batch_size': batch_size, 'temp_dir': tmpdir}
        loader = asr_model._setup_transcribe_dataloader(loader_config)

        for batch in loader:
            signal = batch[0].to(device)
            signal_length = batch[1].to(device)
            log_prob, encoded_len, greedy_predictions = asr_model.forward(
                input_signal=signal, input_signal_length=signal_length)
            characters.extend(
                asr_model._wer.ctc_decoder_predictions_tensor(greedy_predictions))

            lengths = encoded_len.long().cpu()
            probabilities = log_prob.float().cpu()
            for row in range(lengths.shape[0]):
                # Keep only the frames up to this utterance's encoded length.
                valid_frames = lengths[row].detach().numpy().tolist()
                frames = probabilities[row].detach().numpy().tolist()
                log_probs.append(frames[:valid_frames])
            del batch

    asr_model.train(was_training)
    return characters, log_probs, vocab
Exemplo n.º 14
0
def main(
    nemo_file, onnx_file, model_type='asr',
):
    """Restore a model from a .nemo file and export it to ONNX.

    Args:
        nemo_file: path of the .nemo archive to restore from.
        onnx_file: path the ONNX graph is exported to.
        model_type: 'asr', 'speech_label' or 'speaker'; selects the model class.

    Raises:
        NameError: if ``model_type`` is not one of the supported names.
    """
    # Pick the log message and model class first, then act on them once.
    if model_type == 'asr':
        banner, model_cls = "Preparing ASR model", EncDecCTCModel
    elif model_type == 'speech_label':
        banner, model_cls = "Preparing Speech Label Classification model", EncDecClassificationModel
    elif model_type == 'speaker':
        banner, model_cls = "Preparing Speaker Recognition model", EncDecSpeakerLabelModel
    else:
        raise NameError("Available model names are asr, speech_label and speaker")

    logging.info(banner)
    model = model_cls.restore_from(nemo_file)

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")
Exemplo n.º 15
0
def batch_inference(args: argparse.Namespace):
    """Run ASR inference over a corpus, write references/hypotheses and report WER.

    Args:
        args: parsed options; this function reads ``asr_model``,
            ``corpora_dir``, ``limit``, ``batch_size``, ``normalize_text``,
            ``search``, ``arpa`` and ``results_dir``.

    Returns:
        A dict with the computed ``"wer"`` and the ``"args"`` used.
    """
    torch.set_grad_enabled(False)

    use_local_checkpoint = args.asr_model.endswith(".nemo")
    if not use_local_checkpoint:
        print(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)
    else:
        print(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)

    manifest = prepare_manifest(args.corpora_dir, args.limit)
    test_data_config = {
        "sample_rate": 16000,
        "manifest_filepath": manifest,
        "labels": asr_model.decoder.vocabulary,
        "batch_size": args.batch_size,
        "normalize_transcripts": args.normalize_text,
    }
    asr_model.setup_test_data(test_data_config=test_data_config)

    # Each generated item is unpacked as a (reference, hypothesis) pair below.
    pairs = list(tqdm(generate_ref_hyps(asr_model, args.search, args.arpa)))
    references, hypotheses = map(list, zip(*pairs))

    results_dir = args.results_dir
    os.makedirs(results_dir, exist_ok=True)
    data_io.write_lines(f"{results_dir}/refs.txt.gz", references)
    data_io.write_lines(f"{results_dir}/hyps.txt.gz", hypotheses)

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    sys.stdout.flush()
    stats = {"wer": wer_value, "args": args.__dict__}
    data_io.write_json(f"{results_dir}/stats.txt", stats)
    print(f"Got WER of {wer_value}")
    return stats
Exemplo n.º 16
0
    def test_save_restore_from_nemo_file_with_override(self, asr_model):
        """Check restoring from a .nemo file with an overriding config file.

        The restored model must keep the vocabulary size, ``num_weights`` and
        first encoder convolution weights of the saved model, and must report
        the ``activation`` value written to the override config.
        """
        with tempfile.NamedTemporaryFile() as archive_fp:
            with tempfile.NamedTemporaryFile(mode='a+') as override_fp:
                archive_path = archive_fp.name

                # Save model (with random artifact).
                with tempfile.NamedTemporaryFile() as artifact_fp:
                    asr_model.register_artifact(config_path=None, src=artifact_fp.name)
                    asr_model.save_to(save_path=archive_path)

                # Write a slightly modified config to the override file.
                modified_cfg = asr_model.cfg
                modified_cfg.encoder.params.activation = 'swish'
                override_fp.write(OmegaConf.to_yaml(modified_cfg))
                override_fp.seek(0)

                # Restore the model using the override config.
                restored = EncDecCTCModel.restore_from(
                    restore_path=archive_path,
                    override_config_path=override_fp.name,
                )

                original_vocab_size = len(asr_model.decoder.vocabulary)
                restored_vocab_size = len(restored.decoder.vocabulary)
                assert original_vocab_size == restored_vocab_size
                assert asr_model.num_weights == restored.num_weights

                original_weights, restored_weights = (
                    model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()
                    for model in (asr_model, restored)
                )
                assert np.array_equal(original_weights, restored_weights)

                assert restored.cfg.encoder.params.activation == 'swish'
Exemplo n.º 17
0
def ASR_Grade(dataset, id, key):
    """Grade a recording against a reference string and write the result file.

    Runs the ASR model restored from ``model_Path`` over ``dataset``, compares
    every hypothesis with ``key`` via ``word_error_rate`` and turns the error
    rate into a 0-100 score.

    Args:
        dataset: path to the evaluation manifest.
        id: identifier used as the prefix of the ``<id>_graded.txt`` result
            file written under ``datasetPath``.
        key: reference text used for every utterance in the dataset.

    Returns:
        The score as a float, clamped to be no lower than 0.0.
    """
    try:
        from torch.cuda.amp import autocast
    except ImportError:
        from contextlib import contextmanager

        # No-op stand-in used when torch.cuda.amp is not importable.
        @contextmanager
        def autocast(enabled=None):
            yield

    can_gpu = torch.cuda.is_available()

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default=model_Selected,
        required=True,
        help=f'Pass: {model_Selected}',
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--wer_tolerance",
                        type=float,
                        default=1.0,
                        help="used by test")
    parser.add_argument(
        "--normalize_text",
        default=False,  # False <- we're using phonetic references
        type=bool,
        help="Normalize transcripts or not. Set to False for non-English.",
    )
    # Arguments are supplied programmatically, not read from sys.argv.
    args = parser.parse_args(
        ["--dataset", dataset, "--asr_model", model_Selected])
    torch.set_grad_enabled(False)

    # Instantiate Jasper/QuartzNet models with the EncDecCTCModel class.
    asr_model = EncDecCTCModel.restore_from(model_Path)

    asr_model.setup_test_data(
        test_data_config={
            "sample_rate": 16000,
            "manifest_filepath": args.dataset,
            "labels": asr_model.decoder.vocabulary,
            "batch_size": args.batch_size,
            "normalize_transcripts": args.normalize_text,
        })
    if can_gpu:  # noqa
        asr_model = asr_model.cuda()
    asr_model.eval()
    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    hypotheses = []
    references = []
    for test_batch in asr_model.test_dataloader():
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1])
        # Accumulate across batches: references grow by one entry per
        # utterance, so hypotheses must grow the same way or the two lists
        # stop lining up once the dataset spans more than one batch.
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        for batch_ind in range(greedy_predictions.shape[0]):
            reference = key
            print(reference)  #debug
            references.append(reference)
        del test_batch
    wer_value = word_error_rate(hypotheses=hypotheses,
                                references=references)  #cer=True

    # REC/REF end up holding the last hypothesis/reference pair ('.' if none).
    REC = '.'
    REF = '.'
    for h, r in zip(hypotheses, references):
        print("Recognized:\t{}\nReference:\t{}\n".format(h, r))
        REC = h
        REF = r
    logging.info(f"Got PER of {wer_value}. Tolerance was {args.wer_tolerance}")

    #Score Calculation, phoneme conversion
    # divide wer_value by wer_tolerance to get the ratio of correctness (and round it)
    # then multiply by 100 to get a value above 0
    # since this give the "% wrong", subtract from 100 to get "% correct"
    # this gives a positive grade to show return to the user
    score = 100.00 - (round((wer_value / args.wer_tolerance), 4) * 100)
    if score < 0.0:
        score = 0.0
    print(score)

    #Result file creation, to be accessed by JS via 'app.py'
    # The context manager closes the file even if the write fails.
    with open(datasetPath + id + '_graded.txt', 'w') as results:
        results.write(REC + '\n' + REF + '\n' + str(score))
    return score
Exemplo n.º 18
0
def main():
    """Transcribe a dataset with a CTC ASR model and write sclite ``trn`` files.

    Writes ``hyp.trn`` and ``ref.trn`` into ``--out_dir`` and, when
    ``--sctk_dir`` exists on disk, scores them with ``score_with_sctk``.
    """

    def _str2bool(value):
        # ``type=bool`` would turn every non-empty string (including "False")
        # into True, so parse the textual value explicitly instead.
        lowered = value.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise ValueError(f"expected a boolean value, got {value!r}")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5Base-En",
        required=False,
        help="Pass: '******'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=_str2bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    parser.add_argument(
        "--sclite_fmt",
        default="trn",
        type=str,
        help="sclite output format. Only trn and ctm are supported")
    parser.add_argument("--out_dir",
                        type=str,
                        required=True,
                        help="Destination dir for output files")
    parser.add_argument("--sctk_dir",
                        type=str,
                        required=False,
                        default="",
                        help="Path to sctk root dir")
    parser.add_argument("--glm",
                        type=str,
                        required=False,
                        default="",
                        help="Path to glm file")
    parser.add_argument("--ref_stm",
                        type=str,
                        required=False,
                        default="",
                        help="Path to stm file")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.out_dir, exist_ok=True)

    # Scoring is only attempted when the given sctk directory exists.
    use_sctk = os.path.exists(args.sctk_dir)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)
    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
        })
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Map label index -> symbol, used to turn reference token ids back into text.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))

    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    hypotheses = []
    references = []
    for test_batch in asr_model.test_dataloader():
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1])
        # The per-utterance log-probabilities are not kept: nothing in this
        # function reads them, and storing them for the whole dataset only
        # grows memory.
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        for batch_ind in range(greedy_predictions.shape[0]):
            token_ids = test_batch[2][batch_ind].cpu().detach().numpy()
            references.append(''.join(labels_map[c] for c in token_ids))
        del test_batch

    info_list = get_utt_info(args.dataset)
    hypfile = os.path.join(args.out_dir, "hyp.trn")
    reffile = os.path.join(args.out_dir, "ref.trn")
    with open(hypfile, "w") as hyp_f, open(reffile, "w") as ref_f:
        for i, hypothesis in enumerate(hypotheses):
            utt_id = os.path.splitext(
                os.path.basename(info_list[i]['audio_filepath']))[0]
            # rfilter in sctk likes each transcript to have a space at the beginning
            hyp_f.write(" " + hypothesis + " (" + utt_id + ")" + "\n")
            ref_f.write(" " + references[i] + " (" + utt_id + ")" + "\n")

    if use_sctk:
        score_with_sctk(args.sctk_dir,
                        reffile,
                        hypfile,
                        args.out_dir,
                        glm=args.glm,
                        fmt="trn")
Exemplo n.º 19
0
def main():
    """Generate synthetic data from a teacher ASR model and pickle it to disk.

    Command-line arguments select the teacher model, the dataset manifest and
    the synthesis settings (number of batches, batch size, sequence length,
    training iterations, learning rate) as well as where the result is dumped.
    """
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ("--asr_model", dict(type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'")),
        ("--dataset", dict(type=str, required=True, help="path to evaluation data")),
        ("--num_batch", dict(type=int, default=50, help="number of batches of the synthetic data")),
        ("--batch_size", dict(type=int, default=8, help="batch size of the synthetic data")),
        ("--seqlen", dict(type=int, default=500, help="sequence length of the synthetic data")),
        ("--train_iter", dict(type=int, default=200,
                              help="training iterations for the synthetic data generation")),
        ("--dump_path", dict(type=str, default=None, help="path to dump the synthetic data")),
        ("--dump_prefix", dict(type=str, default='syn',
                               help="prefix for the filename of the dumped synthetic data")),
        ("--lr", dict(type=float, default=0.01, help="Learning rate for the synthetic data generation")),
    )
    parser = ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    model_ref = args.asr_model
    if not model_ref.endswith('.nemo'):
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        teacher_model = EncDecCTCModel.from_pretrained(model_name=model_ref)
    else:
        logging.info(f"Using local ASR model from {args.asr_model}")
        teacher_model = EncDecCTCModel.restore_from(restore_path=model_ref)

    test_data_config = {
        'sample_rate': 16000,
        'manifest_filepath': args.dataset,
        'labels': teacher_model.decoder.vocabulary,
        'batch_size': 8,
        'normalize_transcripts': True,
        'shuffle': True,
    }
    teacher_model.setup_test_data(test_data_config=test_data_config)

    # ---------------------------- Distillation ----------------------------

    # Disable quantization mode for the teacher model.
    teacher_model.set_quant_mode('none')
    # Enable backward graph generation.
    torch.set_grad_enabled(True)

    summary = "Num batches: %d, Batch size: %d, Training iterations: %d, Learning rate: %.3f " \
            % (args.num_batch, args.batch_size, args.train_iter, args.lr)
    print(summary)
    print('Synthesizing...')

    synthetic_data = get_synthetic_data(
        teacher_model.encoder,
        teacher_model.decoder,
        batch_size=args.batch_size,
        dim=64,
        seqlen=args.seqlen,
        num_batch=args.num_batch,
        train_iter=args.train_iter,
        lr=args.lr,
    )

    file_name = '%s_nb%d_iter%d_lr%.3f.pkl' % \
            (args.dump_prefix,  args.num_batch, args.train_iter, args.lr)

    dump_dir = args.dump_path
    if dump_dir is not None:
        if not os.path.exists(dump_dir):
            os.makedirs(dump_dir)
        file_name = os.path.join(dump_dir, file_name)

    print('Synthetic data dumped as ', file_name)
    with open(file_name, 'wb') as dump_file:
        # Every item is moved to CPU before pickling.
        pickle.dump([item.cpu() for item in synthetic_data], dump_file)
0
def main():
    """Evaluate an exported ONNX ASR graph with a quantization-enabled NeMo model.

    The model config is loaded first, ``encoder.quantize`` is switched on, and
    the model is then instantiated from that modified config. The resulting
    error rate from ``evaluate`` is logged.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5Base-En",
        required=True,
        help="Pass: '******'",
    )
    parser.add_argument(
        "--asr_onnx",
        type=str,
        default="./QuartzNet15x5Base-En-max-32.onnx",
        help="Pass: '******'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    # store_true so that passing the flag actually changes the value; with
    # store_false and default=False the option could never have any effect.
    parser.add_argument(
        "--dont_normalize_text",
        default=False,
        action='store_true',
        help="Turn off trasnscript normalization. Recommended for non-English.",
    )
    parser.add_argument(
        "--use_cer",
        default=False,
        action='store_true',
        help="Use Character Error Rate as the evaluation metric")
    parser.add_argument('--qat',
                        action="store_true",
                        help="Use onnx file exported from QAT tools")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        # First pass only fetches the config so quantization can be enabled.
        asr_model_cfg = EncDecCTCModel.restore_from(
            restore_path=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.restore_from(
            restore_path=args.asr_model, override_config_path=asr_model_cfg)

    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model_cfg = EncDecCTCModel.from_pretrained(
            model_name=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.from_pretrained(
            model_name=args.asr_model, override_config_path=asr_model_cfg)
    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            # The flag means "do NOT normalize", so it has to be negated here.
            'normalize_transcripts': not args.dont_normalize_text,
        })
    asr_model.preprocessor.featurizer.dither = 0.0
    asr_model.preprocessor.featurizer.pad_to = 0
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()
    # Map label index -> symbol for the evaluation helper.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary, use_cer=args.use_cer)
    wer_result = evaluate(asr_model, args.asr_onnx, labels_map, wer, args.qat)
    logging.info(f'Got WER of {wer_result}.')
def main():
    """Evaluate a quantized ASR model; optionally run sensitivity analysis or export to ONNX.

    Arguments are read from the command line. The model is restored from a
    local ``.nemo`` file when ``--asr_model`` ends with ``.nemo``, otherwise
    it is loaded by name through ``from_pretrained``. It is then evaluated on
    ``--dataset`` and the resulting WER is logged.

    With ``--sensitivity`` each quantized layer is evaluated with only that
    layer's quantizers enabled, and layers are then disabled one at a time
    (highest stand-alone WER first) until the WER is within
    ``--wer_tolerance``. The function returns as soon as that happens, so the
    ONNX export below is not reached in that case.

    With ``--onnx`` the model is exported with ``use_fb_fake_quant`` switched
    on for the duration of the export.

    Raises:
        ValueError: if sensitivity analysis has disabled every quantized layer
            and the WER is still above ``--wer_tolerance``.
    """
    from argparse import ArgumentTypeError

    def str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` is not usable here: ``bool("False")`` is ``True``, so
        any non-empty string would enable the option.
        """
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string stays falsy, as it was under ``type=bool``.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise ArgumentTypeError(f"expected a boolean (true/false), got {value!r}")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--wer_target", type=float, default=None, help="used by test")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=str2bool,
        help="Normalize transcripts or not. Set to False for non-English.",
    )
    parser.add_argument('--sensitivity', action="store_true", help="Perform sensitivity analysis")
    parser.add_argument('--onnx', action="store_true", help="Export to ONNX")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    quant_modules.initialize()

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)
    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
        }
    )
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()
    # Maps label index -> vocabulary entry.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    wer_quant = evaluate(asr_model, labels_map, wer)
    logging.info(f'Got WER of {wer_quant}. Tolerance was {args.wer_tolerance}')

    if args.sensitivity:
        if wer_quant < args.wer_tolerance:
            logging.info("Tolerance is already met. Skip sensitivity analyasis.")
            return

        # Disable every quantizer and collect the distinct layer names they belong to.
        quant_layer_names = []
        for name, module in asr_model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                module.disable()
                layer_name = name.replace("._input_quantizer", "").replace("._weight_quantizer", "")
                if layer_name not in quant_layer_names:
                    quant_layer_names.append(layer_name)
        logging.info(F"{len(quant_layer_names)} quantized layers found.")

        # Build sensitivity profile: evaluate with one layer's quantizers enabled at a time.
        # NOTE(review): layers are matched by substring (`quant_layer in name`), so a
        # layer name that is a prefix of another would match both — confirm this
        # cannot happen with the module naming in use.
        quant_layer_sensitivity = {}
        for quant_layer in quant_layer_names:
            logging.info(F"Enable {quant_layer}")
            for name, module in asr_model.named_modules():
                if isinstance(module, quant_nn.TensorQuantizer) and quant_layer in name:
                    module.enable()
                    logging.info(F"{name:40}: {module}")

            # Eval the model
            wer_value = evaluate(asr_model, labels_map, wer)
            logging.info(F"WER: {wer_value}")
            # Remaining margin to the tolerance; smaller means more sensitive.
            quant_layer_sensitivity[quant_layer] = args.wer_tolerance - wer_value

            for name, module in asr_model.named_modules():
                if isinstance(module, quant_nn.TensorQuantizer) and quant_layer in name:
                    module.disable()
                    logging.info(F"{name:40}: {module}")

        # Skip most sensitive layers until WER target is met
        for name, module in asr_model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                module.enable()
        # Ascending margin puts the layers with the highest stand-alone WER first.
        quant_layer_sensitivity = collections.OrderedDict(sorted(quant_layer_sensitivity.items(), key=lambda x: x[1]))
        pprint(quant_layer_sensitivity)
        skipped_layers = []
        for quant_layer in quant_layer_sensitivity:
            for name, module in asr_model.named_modules():
                if isinstance(module, quant_nn.TensorQuantizer) and quant_layer in name:
                    logging.info(F"Disable {name}")
                    if quant_layer not in skipped_layers:
                        skipped_layers.append(quant_layer)
                    module.disable()
            wer_value = evaluate(asr_model, labels_map, wer)
            if wer_value <= args.wer_tolerance:
                logging.info(
                    F"WER tolerance {args.wer_tolerance} is met by skipping {len(skipped_layers)} sensitive layers."
                )
                print(skipped_layers)
                return
        raise ValueError(f"WER tolerance {args.wer_tolerance} can not be met with any layer quantized!")

    if args.onnx:
        # Swap only the trailing ".nemo"; str.replace would also rewrite any
        # earlier ".nemo" inside the path.
        if args.asr_model.endswith(".nemo"):
            onnx_name = args.asr_model[: -len(".nemo")] + ".onnx"
        else:
            # NOTE(review): a pretrained model name is used as the output path
            # unchanged (no ".onnx" suffix) — confirm that export accepts it.
            onnx_name = args.asr_model
        logging.info(f"Export to {onnx_name}")
        quant_nn.TensorQuantizer.use_fb_fake_quant = True
        try:
            asr_model.export(onnx_name, onnx_opset_version=13)
        finally:
            # Restore the class-level flag even when the export fails.
            quant_nn.TensorQuantizer.use_fb_fake_quant = False
Exemplo n.º 22
0
def main():
    """Calibrate the quantizers of an ASR model and save the calibrated model(s).

    Arguments are read from the command line. The model config is loaded
    first, ``encoder.quantize`` is switched on, and the model is then built
    from that overridden config (from a local ``.nemo`` file or by name via
    ``from_pretrained``).

    ``--num_calib_batch`` batches of ``--dataset`` are passed through the
    model with calibration enabled. With ``--calibrator max`` a single model
    is saved; with ``--calibrator histogram`` one model is saved per value of
    ``--percentile`` plus one each for the ``mse`` and ``entropy`` methods.
    Output files are written next to the input name with the calibration
    method and sample count appended.
    """
    from argparse import ArgumentTypeError

    def str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` is not usable here: ``bool("False")`` is ``True``, so
        any non-empty string would enable the option.
        """
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string stays falsy, as it was under ``type=bool``.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise ArgumentTypeError(f"expected a boolean (true/false), got {value!r}")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5Base-En",
        required=True,
        help="Pass: '******'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=str2bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    parser.add_argument('--num_calib_batch',
                        default=1,
                        type=int,
                        help="Number of batches for calibration.")
    parser.add_argument('--calibrator',
                        type=str,
                        choices=["max", "histogram"],
                        default="max")
    parser.add_argument('--percentile',
                        nargs='+',
                        type=float,
                        default=[99.9, 99.99, 99.999, 99.9999])
    parser.add_argument("--amp",
                        action="store_true",
                        help="Use AMP in calibration.")
    parser.set_defaults(amp=False)

    args = parser.parse_args()
    if args.num_calib_batch < 1:
        # Without at least one batch the calibrators would have nothing to work with.
        parser.error("--num_calib_batch must be at least 1")
    torch.set_grad_enabled(False)

    # Initialize quantization
    quant_desc_input = QuantDescriptor(calib_method=args.calibrator)
    quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
    quant_nn.QuantConvTranspose2d.set_default_quant_desc_input(
        quant_desc_input)
    quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model_cfg = EncDecCTCModel.restore_from(
            restore_path=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.restore_from(
            restore_path=args.asr_model, override_config_path=asr_model_cfg)

    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model_cfg = EncDecCTCModel.from_pretrained(
            model_name=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.from_pretrained(
            model_name=args.asr_model, override_config_path=asr_model_cfg)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
            'shuffle': True,
        })

    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Enable calibrators: quantizers that have one collect statistics instead of
    # quantizing; quantizers without a calibrator are switched off entirely.
    for _, module in asr_model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()

    for i, test_batch in enumerate(asr_model.test_dataloader()):
        # Check before the forward pass so that exactly num_calib_batch batches
        # are used, matching the sample count embedded in the output file names.
        if i >= args.num_calib_batch:
            break
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        if args.amp:
            with autocast():
                _ = asr_model(input_signal=test_batch[0],
                              input_signal_length=test_batch[1])
        else:
            _ = asr_model(input_signal=test_batch[0],
                          input_signal_length=test_batch[1])

    # Save calibrated model(s)
    # Strip only the trailing ".nemo"; str.replace would also remove any
    # earlier ".nemo" inside the path.
    if args.asr_model.endswith(".nemo"):
        model_name = args.asr_model[:-len(".nemo")]
    else:
        model_name = args.asr_model
    num_samples = args.num_calib_batch * args.batch_size
    if args.calibrator != "histogram":
        compute_amax(asr_model, method="max")
        asr_model.save_to(F"{model_name}-max-{num_samples}.nemo")
    else:
        for percentile in args.percentile:
            print(F"{percentile} percentile calibration")
            # Pass the percentile through; without it every iteration computed
            # the same amax and only the file name differed.
            # NOTE(review): assumes compute_amax forwards extra keyword
            # arguments to the calibrator — confirm against its definition.
            compute_amax(asr_model, method="percentile", percentile=percentile)
            asr_model.save_to(
                F"{model_name}-percentile-{percentile}-{num_samples}.nemo")

        for method in ["mse", "entropy"]:
            print(F"{method} calibration")
            compute_amax(asr_model, method=method)
            asr_model.save_to(F"{model_name}-{method}-{num_samples}.nemo")
Exemplo n.º 23
0
def main():
    """Calibrate (unless ``--dynamic``) and evaluate a quantized ASR model, printing the WER.

    Arguments are read from the command line. The model is restored from a
    local ``.nemo`` file when ``--asr_model`` ends with ``.nemo``, otherwise
    it is loaded by name through ``from_pretrained``.

    Unless ``--dynamic`` is given, a pickled set of synthetic batches must be
    supplied with ``--load``; those batches are run through the encoder and
    decoder in calibration mode. The model is then evaluated on ``--dataset``
    with greedy CTC decoding and the word error rate is printed.

    The process exits with an argparse usage error when neither ``--load``
    nor ``--dynamic`` is given.
    """
    from argparse import ArgumentTypeError

    def str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` is not usable here: ``bool("False")`` is ``True``, so
        any non-empty string would enable the option.
        """
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string stays falsy, as it was under ``type=bool``.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise ArgumentTypeError(f"expected a boolean (true/false), got {value!r}")

    parser = ArgumentParser()
    # Training arguments
    parser.add_argument("--asr_model",
                        type=str,
                        default="QuartzNet15x5Base-En",
                        required=True,
                        help="Pass: '******'")
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=str2bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    parser.add_argument("--shuffle",
                        action='store_true',
                        help="Shuffle test data.")
    # Calibration arguments
    parser.add_argument("--load",
                        type=str,
                        default=None,
                        help="load path for the synthetic data")
    parser.add_argument(
        "--percentile",
        type=float,
        default=None,
        help="Max/min percentile for outlier handling. e.g., 99.9")
    # Quantization arguments
    parser.add_argument("--weight_bit",
                        type=int,
                        default=8,
                        help="quantization bit for weights")
    parser.add_argument("--act_bit",
                        type=int,
                        default=8,
                        help="quantization bit for activations")
    parser.add_argument("--dynamic",
                        action='store_true',
                        help="Dynamic quantization mode.")
    parser.add_argument("--no_quant",
                        action='store_true',
                        help="No quantization mode.")
    # Debugging arguments
    parser.add_argument("--eval_early_stop",
                        type=int,
                        default=None,
                        help="early stop for debugging")
    parser.add_argument("--calib_early_stop",
                        type=int,
                        default=None,
                        help="early stop calibration")

    args = parser.parse_args()

    # Validate before any model is loaded. An `assert` would be stripped under
    # `python -O` and the calibration step would then fail on undefined names.
    if args.load is None and not args.dynamic:
        parser.error(
            "synthetic data must be loaded unless running with the dynamic quantization mode"
        )

    torch.set_grad_enabled(False)  # disable backward graph generation

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
            'shuffle': args.shuffle,
        })

    distilled_data = None
    if args.load is not None:
        print('Data loaded from %s' % args.load)
        # NOTE(security): pickle.load can execute arbitrary code from the file;
        # only point --load at files from a trusted source.
        with open(args.load, 'rb') as f:
            distilled_data = pickle.load(f)
        # The first and last dimensions of the first batch are taken as batch
        # size and sequence length for every batch.
        synthetic_batch_size, _, synthetic_seqlen = distilled_data[0].shape

    ############################## Calibration #####################################

    asr_model.eval()  # evaluation mode
    asr_model.set_quant_bit(args.weight_bit, mode='weight')
    asr_model.set_quant_bit(args.act_bit, mode='act')

    # set percentile
    if args.percentile is not None:
        qm.set_percentile(asr_model, args.percentile)

    if args.no_quant:
        asr_model.set_quant_mode('none')
    else:
        asr_model.encoder.bn_folding()  # BN folding

    # if not dynamic quantization, calibrate min/max/range for the activations using synthetic data
    # if dynamic, we can skip calibration
    if not args.dynamic:
        print('Calibrating...')
        qm.calibrate(asr_model)
        length = torch.tensor([synthetic_seqlen] * synthetic_batch_size).cuda()
        for batch_idx, inputs in enumerate(distilled_data):
            if args.calib_early_stop is not None and batch_idx == args.calib_early_stop:
                break
            inputs = inputs.cuda()
            encoded, _, encoded_scaling_factor = asr_model.encoder(
                audio_signal=inputs, length=length)
            # The decoder output is not needed; the call is kept so the decoder
            # also runs on the calibration data.
            asr_model.decoder(
                encoder_output=encoded,
                encoder_output_scaling_factor=encoded_scaling_factor)

    ############################## Evaluation  #####################################

    print('Evaluating...')
    qm.evaluate(asr_model)

    qm.set_dynamic(
        asr_model,
        args.dynamic)  # if dynamic quantization, this will be enabled
    # Maps label index -> vocabulary entry.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    hypotheses = []
    references = []
    progress_bar = tqdm(asr_model.test_dataloader())

    for i, test_batch in enumerate(progress_bar):
        # eval_early_stop is None by default, in which case this never triggers.
        if i == args.eval_early_stop:
            break
        test_batch = [x.cuda().float() for x in test_batch]
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1])
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        for batch_ind in range(greedy_predictions.shape[0]):
            # NOTE(review): the full row of test_batch[2] is decoded without a
            # transcript length; if rows are padded, padding ids end up in the
            # reference text — confirm this is intended.
            reference = ''.join([
                labels_map[c]
                for c in test_batch[2][batch_ind].cpu().detach().numpy()
            ])
            references.append(reference)
        del test_batch
    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    print('WER:', wer_value)