def __init__(self, path='ConformerS.h5'):
    """Load a pretrained Conformer ASR model, downloading weights if absent.

    Args:
        path: Filesystem path where the model weights (.h5) live, or where
            they should be downloaded to when missing.
    """
    # Fetch and load the config of the model.
    config = Config('tamil_tech/configs/conformer_new_config.yml', learning=True)

    # Build speech and text featurizers from the config.
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)
    text_featurizer = CharFeaturizer(config.decoder_config)

    # Download the pretrained weights only when they are not already present.
    # (Original used `if exists: pass else: ...`; inverted for clarity.)
    if not os.path.exists(path):
        print("Downloading Model...")
        file_id = config.file_id
        download_file_from_google_drive(file_id, path)
        print("Downloaded Model Successfully...")

    # Instantiate the model from config and build it with the featurizer shape.
    self.model = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
    self.model._build(speech_featurizer.shape)

    # Load the pretrained weights and display a summary.
    self.model.load_weights(path, by_name=True)
    self.model.summary(line_length=120)

    # Attach featurizers so the model can run end-to-end recognition.
    self.model.add_featurizers(speech_featurizer, text_featurizer)
    print("Loaded Model...!")
def _ds2_tflite_converter(concrete_func):
    """Build a TFLite converter for *concrete_func* with the flags shared by both modes."""
    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    return converter


def test_ds2():
    """Build DeepSpeech2 from the default config and verify both TFLite conversion paths."""
    config = Config(DEFAULT_YAML)
    text_featurizer = CharFeaturizer(config.decoder_config)
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = DeepSpeech2(vocabulary_size=text_featurizer.num_classes, **config.model_config)
    model._build(speech_featurizer.shape)
    model.summary(line_length=150)
    model.add_featurizers(speech_featurizer=speech_featurizer, text_featurizer=text_featurizer)

    # Beam-search decoding path.
    concrete_func = model.make_tflite_function(greedy=False).get_concrete_function()
    _ds2_tflite_converter(concrete_func).convert()
    print("Converted successfully with beam search")

    # Greedy decoding path.
    concrete_func = model.make_tflite_function(greedy=True).get_concrete_function()
    _ds2_tflite_converter(concrete_func).convert()
    print("Converted successfully with greedy")
"num_masks": 10, "mask_factor": 100, "p_upperbound": 0.05 }, "freq_masking": { "mask_factor": 27 } }, "include_original": False } data = "/mnt/Data/ML/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv" text_featurizer = CharFeaturizer({ "vocabulary": None, "blank_at_zero": True, "beam_width": 5, "norm_score": True }) speech_featurizer = TFSpeechFeaturizer({ "sample_rate": 16000, "frame_ms": 25, "stride_ms": 10, "num_feature_bins": 80, "feature_type": "log_mel_spectrogram", "preemphasis": 0.97, "normalize_signal": True, "normalize_feature": True, "normalize_per_feature": False })
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
from tensorflow_asr.models.ctc.jasper import Jasper
from tensorflow_asr.utils import app_util

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

# Select the text featurizer implementation from the CLI flags.
if args.sentence_piece:
    print("Use SentencePiece ...")
    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
elif args.subwords:
    print("Use subwords ...")
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    print("Use characters ...")
    text_featurizer = CharFeaturizer(config.decoder_config)

tf.random.set_seed(0)

# Both dataset flavors take identical constructor kwargs, so pick the class first.
dataset_cls = ASRTFRecordDataset if args.tfrecords else ASRSliceDataset
test_dataset = dataset_cls(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer,
    **vars(config.learning_config.test_dataset_config))
# build model
setup_devices([args.device]) from tensorflow_asr.configs.user_config import UserConfig from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer from tensorflow_asr.runners.base_runners import BaseTester from model import DeepSpeech2 tf.random.set_seed(0) assert args.export config = UserConfig(DEFAULT_YAML, args.config, learning=True) speech_featurizer = TFSpeechFeaturizer(config["speech_config"]) text_featurizer = CharFeaturizer(config["decoder_config"]) # Build DS2 model ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape, arch_config=config["model_config"], num_classes=text_featurizer.num_classes, name="deepspeech2") ds2_model._build(speech_featurizer.shape) ds2_model.load_weights(args.saved, by_name=True) ds2_model.summary(line_length=150) ds2_model.add_featurizers(speech_featurizer, text_featurizer) if args.tfrecords: test_dataset = ASRTFRecordDataset( data_paths=config["learning_config"]["dataset_config"]["test_paths"], tfrecords_dir=config["learning_config"]["dataset_config"]["tfrecords_dir"], speech_featurizer=speech_featurizer,
def _contextnet_tflite_converter(concrete_func):
    """Build a TFLite converter for *concrete_func* with the flags shared by both modes."""
    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.experimental_new_converter = True
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    return converter


def test_contextnet():
    """Build ContextNet, convert to TFLite with and without timestamps, then run inference."""
    config = Config(DEFAULT_YAML)
    text_featurizer = CharFeaturizer(config.decoder_config)
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = ContextNet(vocabulary_size=text_featurizer.num_classes, **config.model_config)
    model.make(speech_featurizer.shape)
    model.summary(line_length=150)
    model.add_featurizers(speech_featurizer=speech_featurizer, text_featurizer=text_featurizer)

    # Convert without timestamps; keep this flatbuffer for the interpreter run below.
    concrete_func = model.make_tflite_function(timestamp=False).get_concrete_function()
    tflite = _contextnet_tflite_converter(concrete_func).convert()
    logger.info("Converted successfully with no timestamp")

    # Convert again with timestamps purely to check the conversion succeeds.
    concrete_func = model.make_tflite_function(timestamp=True).get_concrete_function()
    _contextnet_tflite_converter(concrete_func).convert()
    logger.info("Converted successfully with timestamp")

    # Run the no-timestamp model on random audio through the TFLite interpreter.
    tflitemodel = tf.lite.Interpreter(model_content=tflite)
    signal = tf.random.normal([4000])

    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], [4000])
    tflitemodel.allocate_tensors()
    tflitemodel.set_tensor(input_details[0]["index"], signal)
    tflitemodel.set_tensor(input_details[1]["index"],
                           tf.constant(text_featurizer.blank, dtype=tf.int32))
    # Zero initial states for the prediction network: [num_rnns, 2, batch=1, units].
    tflitemodel.set_tensor(
        input_details[2]["index"],
        tf.zeros([
            config.model_config["prediction_num_rnns"],
            2,
            1,
            config.model_config["prediction_rnn_units"],
        ], dtype=tf.float32),
    )
    tflitemodel.invoke()
    hyp = tflitemodel.get_tensor(output_details[0]["index"])

    logger.info(hyp)
def _save_histogram(values, title, path, bins=200):
    """Plot a histogram of *values*, save it to *path*, and reset pyplot state."""
    plt.hist(values, bins, color="green", histtype="stepfilled")
    plt.title(title, fontweight="bold")
    plt.savefig(path)
    plt.clf()
    plt.cla()
    plt.close()


def main():
    """Plot activation histograms of a SelfAttentionDS2 model for one audio file."""
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")
    parser.add_argument("--config", type=str, default=None, help="Config file")
    parser.add_argument("--audio", type=str, default=None, help="Audio file")
    parser.add_argument("--saved_model", type=str, default=None, help="Saved model")
    # BUG FIX: argparse `type=bool` is broken — bool("False") is True, so any
    # value enabled the flag. Use a proper store_true flag instead.
    parser.add_argument("--from_weights", default=False, action="store_true",
                        help="Load from weights")
    parser.add_argument("--output", type=str, default=None, help="Output dir storing histograms")
    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(Scorer(**text_featurizer.decoder_config["lm_config"],
                                      vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)
    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    # One histogram per intermediate layer of the base model.
    for i in range(1, len(satt_ds2_model.base_model.layers)):
        func = tf.keras.backend.function([satt_ds2_model.base_model.input],
                                         [satt_ds2_model.base_model.layers[i].output])
        data = func([np.expand_dims(features, 0), 1])[0][0]
        print(data.shape)
        layer_name = satt_ds2_model.base_model.layers[i].name
        _save_histogram(data.flatten(), f"Output of {layer_name}",
                        os.path.join(args.output, f"{i}_{layer_name}.png"))

    # Final (logits) layer output.
    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    _save_histogram(fc[0].numpy().flatten(), f"Output of {satt_ds2_model.layers[-1].name}",
                    os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))

    # Softmax probabilities: 10 bins since values live in [0, 1].
    fc = tf.nn.softmax(fc)
    _save_histogram(fc[0].numpy().flatten(), "Output of softmax",
                    os.path.join(args.output, "softmax_hist.png"), bins=10)

    # Input features.
    _save_histogram(features.flatten(), "Log Mel Spectrogram",
                    os.path.join(args.output, "log_mel_spectrogram.png"))
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer

# Quick smoke check: build a character featurizer with no vocabulary file and
# the blank token at index zero, then encode a sample string and show the ids.
txf = CharFeaturizer(None, blank_at_zero=True)
a = txf.extract("fkaff aksfbfnak kcjhoiu")
print(a)
args = parser.parse_args() tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": args.mxp}) strategy = setup_strategy(args.devices) from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer from tensorflow_asr.models.keras.jasper import Jasper config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) text_featurizer = CharFeaturizer(config.decoder_config) if args.tfrecords: train_dataset = ASRTFRecordDatasetKeras( speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True) eval_dataset = ASRTFRecordDatasetKeras( speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, **vars(config.learning_config.eval_dataset_config)) # Update metadata calculated from both train and eval datasets train_dataset.load_metadata(args.metadata_prefix) eval_dataset.load_metadata(args.metadata_prefix) # Use dynamic length
def _plot_matrix(matrix, title, path):
    """Render *matrix* as an image with a right-hand colorbar, save it to *path*,
    and reset pyplot state."""
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title(title, fontweight="bold")
    im = ax.imshow(matrix, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(path)
    plt.clf()
    plt.cla()
    plt.close()


def main():
    """Plot feature/logit/softmax images of a SelfAttentionDS2 model for one audio file."""
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")
    parser.add_argument("--config", type=str, default=None, help="Config file")
    parser.add_argument("--audio", type=str, default=None, help="Audio file")
    parser.add_argument("--saved_model", type=str, default=None, help="Saved model")
    # BUG FIX: argparse `type=bool` is broken — bool("False") is True, so any
    # value enabled the flag. Use a proper store_true flag instead.
    parser.add_argument("--from_weights", default=False, action="store_true",
                        help="Load from weights")
    parser.add_argument("--output", type=str, default=None, help="Output dir storing histograms")
    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(
        Scorer(**text_featurizer.decoder_config["lm_config"],
               vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)
    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    # (Removed a long commented-out per-layer plotting loop that was dead code.)

    # Final (logits) layer output.
    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    _plot_matrix(fc[0].numpy().T, f"{satt_ds2_model.layers[-1].name}",
                 os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))

    # Softmax probabilities.
    fc = tf.nn.softmax(fc)
    _plot_matrix(fc[0].numpy().T, "Softmax",
                 os.path.join(args.output, "softmax.png"))

    # Input features (first channel only).
    _plot_matrix(features[:, :, 0].T, "Log Mel Spectrogram",
                 os.path.join(args.output, "features.png"))
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.utils.utils import bytes_to_string, merge_two_last_dims decoder_config = { "vocabulary": "/mnt/Projects/asrk16/TiramisuASR/vocabularies/vietnamese.txt", "beam_width": 100, "blank_at_zero": False, "lm_config": { "model_path": "/mnt/Data/ML/NLP/vntc_asrtrain_5gram_trie.binary", "alpha": 2.0, "beta": 2.0 } } text_featurizer = CharFeaturizer(decoder_config) text_featurizer.add_scorer( Scorer(**decoder_config["lm_config"], vocabulary=text_featurizer.vocab_array)) speech_featurizer = TFSpeechFeaturizer({ "sample_rate": 16000, "frame_ms": 25, "stride_ms": 10, "num_feature_bins": 80, "feature_type": "spectrogram", "preemphasis": 0.97, # "delta": True, # "delta_delta": True, "normalize_signal": True, "normalize_feature": True, "normalize_per_feature": False,