Example #1
    def __init__(self, path='ConformerS.h5'):
        # fetch and load the config of the model
        config = Config('tamil_tech/configs/conformer_new_config.yml', learning=True)

        # load speech and text featurizers
        speech_featurizer = TFSpeechFeaturizer(config.speech_config)
        text_featurizer = CharFeaturizer(config.decoder_config)

        # download the model to the given path if it is not already present
        if not os.path.exists(path):
          print("Downloading Model...")
          download_file_from_google_drive(config.file_id, path)
          print("Downloaded Model Successfully...")
        
        # load model using config
        self.model = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
        # set shape of the featurizer and build the model
        self.model._build(speech_featurizer.shape)
        # load weights of the model
        self.model.load_weights(path, by_name=True)
        # display model summary
        self.model.summary(line_length=120)
        # set featurizers for the model
        self.model.add_featurizers(speech_featurizer, text_featurizer)

        print("Loaded Model...!")
Example #2
def test_ds2():
    config = Config(DEFAULT_YAML)

    text_featurizer = CharFeaturizer(config.decoder_config)

    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = DeepSpeech2(vocabulary_size=text_featurizer.num_classes, **config.model_config)

    model._build(speech_featurizer.shape)
    model.summary(line_length=150)

    model.add_featurizers(speech_featurizer=speech_featurizer, text_featurizer=text_featurizer)

    concrete_func = model.make_tflite_function(greedy=False).get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
    converter.convert()

    print("Converted successfully with beam search")

    concrete_func = model.make_tflite_function(greedy=True).get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
    converter.convert()

    print("Converted successfully with greedy")
            "num_masks": 10,
            "mask_factor": 100,
            "p_upperbound": 0.05
        },
        "freq_masking": {
            "mask_factor": 27
        }
    },
    "include_original": False
}

data = "/mnt/Data/ML/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv"

text_featurizer = CharFeaturizer({
    "vocabulary": None,
    "blank_at_zero": True,
    "beam_width": 5,
    "norm_score": True
})

speech_featurizer = TFSpeechFeaturizer({
    "sample_rate": 16000,
    "frame_ms": 25,
    "stride_ms": 10,
    "num_feature_bins": 80,
    "feature_type": "log_mel_spectrogram",
    "preemphasis": 0.97,
    "normalize_signal": True,
    "normalize_feature": True,
    "normalize_per_feature": False
})
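# Sketch (not in the original snippet): applying the two featurizers to raw data,
# following the read_raw_audio/extract usage shown in Examples #7 and #8.
# read_raw_audio lives in tensorflow_asr.featurizers.speech_featurizers.
signal = read_raw_audio("sample.wav", speech_featurizer.sample_rate)
features = speech_featurizer.extract(signal)     # log-mel features, roughly [T, 80, 1]
tokens = text_featurizer.extract("hello world")  # tensor of character indices
print(features.shape, tokens)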
Example #4
import tensorflow as tf

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
from tensorflow_asr.models.ctc.jasper import Jasper
from tensorflow_asr.utils import app_util

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.sentence_piece:
    print("Use SentencePiece ...")
    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
elif args.subwords:
    print("Use subwords ...")
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    print("Use characters ...")
    text_featurizer = CharFeaturizer(config.decoder_config)

tf.random.set_seed(0)

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config))
else:
    test_dataset = ASRSliceDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config))

# build model
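# Plausible completion (assumed, mirroring the build-and-load pattern of
# Examples #2 and #6; "args.saved" is a hypothetical weights-path flag, and
# make vs _build varies by library version):
jasper = Jasper(vocabulary_size=text_featurizer.num_classes, **config.model_config)
jasper.make(speech_featurizer.shape)
jasper.load_weights(args.saved, by_name=True)
jasper.summary(line_length=100)
jasper.add_featurizers(speech_featurizer, text_featurizer)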
Example #5
setup_devices([args.device])

from tensorflow_asr.configs.user_config import UserConfig
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
from model import DeepSpeech2

tf.random.set_seed(0)
assert args.export

config = UserConfig(DEFAULT_YAML, args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
text_featurizer = CharFeaturizer(config["decoder_config"])
# Build DS2 model
ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
                        arch_config=config["model_config"],
                        num_classes=text_featurizer.num_classes,
                        name="deepspeech2")
ds2_model._build(speech_featurizer.shape)
ds2_model.load_weights(args.saved, by_name=True)
ds2_model.summary(line_length=150)
ds2_model.add_featurizers(speech_featurizer, text_featurizer)

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        data_paths=config["learning_config"]["dataset_config"]["test_paths"],
        tfrecords_dir=config["learning_config"]["dataset_config"]["tfrecords_dir"],
        speech_featurizer=speech_featurizer,
Example #6
def test_contextnet():
    config = Config(DEFAULT_YAML)

    text_featurizer = CharFeaturizer(config.decoder_config)

    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = ContextNet(vocabulary_size=text_featurizer.num_classes,
                       **config.model_config)

    model.make(speech_featurizer.shape)
    model.summary(line_length=150)

    model.add_featurizers(speech_featurizer=speech_featurizer,
                          text_featurizer=text_featurizer)

    concrete_func = model.make_tflite_function(
        timestamp=False).get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
        [concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
    ]
    tflite = converter.convert()

    logger.info("Converted successfully with no timestamp")

    concrete_func = model.make_tflite_function(
        timestamp=True).get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
        [concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
    ]
    converter.convert()

    logger.info("Converted successfully with timestamp")

    # Run the (no-timestamp) TFLite model on 4000 samples of random noise.
    tflitemodel = tf.lite.Interpreter(model_content=tflite)
    signal = tf.random.normal([4000])

    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], [4000])
    tflitemodel.allocate_tensors()
    # input 0: the raw audio signal
    tflitemodel.set_tensor(input_details[0]["index"], signal)
    # input 1: the previously predicted token, initialized to blank
    tflitemodel.set_tensor(input_details[1]["index"],
                           tf.constant(text_featurizer.blank, dtype=tf.int32))
    # input 2: zero-initialized prediction-network RNN states
    tflitemodel.set_tensor(
        input_details[2]["index"],
        tf.zeros([
            config.model_config["prediction_num_rnns"], 2, 1,
            config.model_config["prediction_rnn_units"]
        ],
                 dtype=tf.float32))
    tflitemodel.invoke()
    hyp = tflitemodel.get_tensor(output_details[0]["index"])

    logger.info(hyp)
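    # The hypothesis returned by the transducer's TFLite function is a sequence
    # of unicode code points (an assumption based on this library's TFLite
    # demos); a short sketch of turning it into text:
    transcript = "".join(chr(u) for u in hyp)
    logger.info(transcript)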
Example #7

def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")

    parser.add_argument("--config", type=str, default=None,
                        help="Config file")

    parser.add_argument("--audio", type=str, default=None,
                        help="Audio file")

    parser.add_argument("--saved_model", type=str, default=None,
                        help="Saved model")

    parser.add_argument("--from_weights", type=bool, default=False,
                        help="Load from weights")

    parser.add_argument("--output", type=str, default=None,
                        help="Output dir storing histograms")

    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(Scorer(**text_featurizer.decoder_config["lm_config"],
                                      vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)

    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    for i in range(1, len(satt_ds2_model.base_model.layers)):
        func = tf.keras.backend.function([satt_ds2_model.base_model.input],
                                         [satt_ds2_model.base_model.layers[i].output])
        data = func([np.expand_dims(features, 0), 1])[0][0]
        print(data.shape)
        data = data.flatten()
        plt.hist(data, 200, color='green', histtype="stepfilled")
        plt.title(f"Output of {satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
        plt.savefig(os.path.join(
            args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
        plt.clf()
        plt.cla()
        plt.close()

    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.hist(fc[0].numpy().flatten(), 200, color="green", histtype="stepfilled")
    plt.title(f"Output of {satt_ds2_model.layers[-1].name}", fontweight="bold")
    plt.savefig(os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()
    fc = tf.nn.softmax(fc)
    plt.hist(fc[0].numpy().flatten(), 10, color="green", histtype="stepfilled")
    plt.title("Output of softmax", fontweight="bold")
    plt.savefig(os.path.join(args.output, "softmax_hist.png"))
    plt.clf()
    plt.cla()
    plt.close()
    plt.hist(features.flatten(), 200, color="green", histtype="stepfilled")
    plt.title("Log Mel Spectrogram", fontweight="bold")
    plt.savefig(os.path.join(args.output, "log_mel_spectrogram.png"))
    plt.clf()
    plt.cla()
    plt.close()
Example #8

from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer

txf = CharFeaturizer(None, blank_at_zero=True)

a = txf.extract("fkaff aksfbfnak kcjhoiu")

print(a)
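# Round-trip sanity check (not in the original snippet): iextract inverts
# extract in this library's text featurizer API, mapping index tensors back
# to strings.
import tensorflow as tf

decoded = txf.iextract(tf.expand_dims(a, 0))
print(decoded.numpy())  # e.g. [b'fkaff aksfbfnak kcjhoiu']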
Example #9

args = parser.parse_args()

tf.config.optimizer.set_experimental_options(
    {"auto_mixed_precision": args.mxp})

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.models.keras.jasper import Jasper

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
text_featurizer = CharFeaturizer(config.decoder_config)

if args.tfrecords:
    train_dataset = ASRTFRecordDatasetKeras(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.train_dataset_config),
        indefinite=True)
    eval_dataset = ASRTFRecordDatasetKeras(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.eval_dataset_config))
    # Update metadata calculated from both train and eval datasets
    train_dataset.load_metadata(args.metadata_prefix)
    eval_dataset.load_metadata(args.metadata_prefix)
    # Use dynamic length
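# The snippet is truncated here. A plausible continuation (assumed, mirroring
# the library's Keras training examples): build the model under the
# distribution strategy before compiling and fitting.
with strategy.scope():
    global_batch_size = config.learning_config.running_config.batch_size
    global_batch_size *= strategy.num_replicas_in_sync

    jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
    jasper._build(speech_featurizer.shape)
    jasper.summary(line_length=100)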
Example #10
def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")

    parser.add_argument("--config", type=str, default=None, help="Config file")

    parser.add_argument("--audio", type=str, default=None, help="Audio file")

    parser.add_argument("--saved_model",
                        type=str,
                        default=None,
                        help="Saved model")

    parser.add_argument("--from_weights",
                        type=bool,
                        default=False,
                        help="Load from weights")

    parser.add_argument("--output",
                        type=str,
                        default=None,
                        help="Output dir storing histograms")

    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(
        Scorer(**text_featurizer.decoder_config["lm_config"],
               vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)

    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0),
                                            lm=True)
    print(bytes_to_string(decoded.numpy()))

    # for i in range(1, len(satt_ds2_model.base_model.layers)):
    #     func = tf.keras.backend.function([satt_ds2_model.base_model.input],
    #                                      [satt_ds2_model.base_model.layers[i].output])
    #     data = func([np.expand_dims(features, 0), 1])[0][0]
    #     print(data.shape)
    #     plt.figure(figsize=(16, 5))
    #     ax = plt.gca()
    #     im = ax.imshow(data.T, origin="lower", aspect="auto")
    #     ax.set_title(f"{satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
    #     divider = make_axes_locatable(ax)
    #     cax = divider.append_axes("right", size="5%", pad=0.05)
    #     plt.colorbar(im, cax=cax)
    #     plt.savefig(os.path.join(
    #         args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
    #     plt.clf()
    #     plt.cla()
    #     plt.close()

    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title(f"{satt_ds2_model.layers[-1].name}", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(
        os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()
    fc = tf.nn.softmax(fc)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Softmax", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "softmax.png"))
    plt.clf()
    plt.cla()
    plt.close()
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Log Mel Spectrogram", fontweight="bold")
    im = ax.imshow(features[:, :, 0].T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "features.png"))
    plt.clf()
    plt.cla()
    plt.close()
Example #11
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.utils.utils import bytes_to_string, merge_two_last_dims
from ctc_decoders import Scorer  # used below for KenLM language-model scoring

decoder_config = {
    "vocabulary":
    "/mnt/Projects/asrk16/TiramisuASR/vocabularies/vietnamese.txt",
    "beam_width": 100,
    "blank_at_zero": False,
    "lm_config": {
        "model_path": "/mnt/Data/ML/NLP/vntc_asrtrain_5gram_trie.binary",
        "alpha": 2.0,
        "beta": 2.0
    }
}
text_featurizer = CharFeaturizer(decoder_config)
text_featurizer.add_scorer(
    Scorer(**decoder_config["lm_config"],
           vocabulary=text_featurizer.vocab_array))
speech_featurizer = TFSpeechFeaturizer({
    "sample_rate": 16000,
    "frame_ms": 25,
    "stride_ms": 10,
    "num_feature_bins": 80,
    "feature_type": "spectrogram",
    "preemphasis": 0.97,
    # "delta": True,
    # "delta_delta": True,
    "normalize_signal": True,
    "normalize_feature": True,
    "normalize_per_feature": False,