import os
import re

from tensorflow_asr.datasets.asr_dataset import ASRSliceDataset, ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import (
    SentencePieceFeaturizer,
    SubwordFeaturizer,
    TextFeaturizer,
)


def test_featurizer():
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ],
    }
    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        "feature_type": "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
    }

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(config, None)
    subwords_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        os.pardir,
        os.pardir,
        "vocabularies",
        "librispeech_train_4_1030.subwords",
    )
    text_featurizer_subwords = SubwordFeaturizer.load_from_file(config, subwords_path)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "transcripts_librispeech_train_clean_100.tsv",
    )

    def get_data(featurizer: TextFeaturizer):
        # Build a single-example batch using the given text featurizer.
        train_dataset = ASRSliceDataset(
            data_paths=[data_path],
            speech_featurizer=speech_featurizer,
            text_featurizer=featurizer,
            stage="train",
            shuffle=False,
        )
        train_data = train_dataset.create(1)
        return next(iter(train_data))

    data_sentencepiece = get_data(text_featurizer_sentencepiece)
    data_subwords = get_data(text_featurizer_subwords)

    # Both featurizers must yield batches with the same structure, and the
    # speech features (index 0) must match in shape and dtype.
    assert len(data_sentencepiece) == len(data_subwords)
    assert data_sentencepiece[0].shape == data_subwords[0].shape
    assert data_sentencepiece[0].dtype == data_subwords[0].dtype
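# A small illustrative helper, not part of the original tests: both featurizer
# types share the extract/iextract interface exercised in this module, so an
# encode/decode round trip should recover the (normalized) input transcript.
# The helper name and the sample string below are assumptions for illustration.
def _roundtrip(featurizer: TextFeaturizer, text: str) -> str:
    import tensorflow as tf

    ids = featurizer.extract(text)  # 1-D tensor of token ids
    decoded = featurizer.iextract(tf.expand_dims(ids, 0))  # iextract takes a batch
    return decoded.numpy()[0].decode("utf-8")

# e.g. _roundtrip(text_featurizer_sentencepiece, "hello world") -> "hello world"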
def test_iextract():
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ],
    }
    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        "feature_type": "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_frame": False,
    }

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(config, None)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "transcripts_librispeech_train_clean_100.tsv",
    )

    train_dataset = ASRSliceTestDataset(
        data_paths=[data_path],
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer_sentencepiece,
        stage="train",
        shuffle=False,
    )
    train_data = train_dataset.create(1)
    batch = next(iter(train_data))
    file_paths, features, input_length, labels = batch

    # Decode the label ids back into a transcript string.
    labels = text_featurizer_sentencepiece.iextract(labels)
    labels = labels.numpy()[0].decode("utf-8")

    # Derive the transcript file path from the audio file path, e.g.
    # ".../103-1240-0000.flac" -> ".../103-1240.trans.txt".
    file_path = file_paths[0].numpy().decode("utf-8")
    file_path = re.sub(r"(?<!\s)-[0-9]{4}\.flac", ".trans.txt", file_path)
    print(file_path)
    with open(file_path, "r") as f:
        lines = f.read().splitlines()

    # LibriSpeech transcript lines look like "103-1240-0000 SOME TRANSCRIPT".
    m = re.search(r"[0-9]+-[0-9]+-[0-9]+\s+([\w\s]+)", lines[0])
    transcript = m.group(1).lower()
    assert labels == transcript
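def test_transcript_path_rewrite():
    # A minimal sketch, not part of the original tests; the example path is
    # made up. It documents the rewrite used in test_iextract: LibriSpeech
    # audio "<speaker>-<chapter>-<utterance>.flac" maps to the chapter-level
    # transcript file "<speaker>-<chapter>.trans.txt".
    example = "/data/datasets/LibriSpeech/train-clean-100/103/1240/103-1240-0000.flac"
    rewritten = re.sub(r"(?<!\s)-[0-9]{4}\.flac", ".trans.txt", example)
    assert rewritten == "/data/datasets/LibriSpeech/train-clean-100/103/1240/103-1240.trans.txt"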
parser.add_argument("transcripts", nargs="+", type=str, default=None, help="Paths to transcript files") args = parser.parse_args() transcripts = preprocess_paths(args.transcripts) tfrecords_dir = preprocess_paths(args.tfrecords_dir) config = Config(args.config) if args.sentence_piece: print("Loading SentencePiece model ...") text_featurizer = SentencePieceFeaturizer.load_from_file( config.decoder_config, args.subwords) elif args.subwords and os.path.exists(args.subwords): print("Loading subwords ...") text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) ASRTFRecordDataset(data_paths=transcripts, tfrecords_dir=tfrecords_dir, speech_featurizer=None, text_featurizer=text_featurizer, stage=args.mode, shuffle=args.shuffle, tfrecords_shards=args.tfrecords_shards).create_tfrecords()
env_util.setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
from tensorflow_asr.models.ctc.jasper import Jasper
from tensorflow_asr.utils import app_util

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.sentence_piece:
    print("Use SentencePiece ...")
    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
elif args.subwords:
    print("Use subwords ...")
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    print("Use characters ...")
    text_featurizer = CharFeaturizer(config.decoder_config)

tf.random.set_seed(0)

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config),
    )
else:
    # Read audio slices directly from the transcript files instead of TFRecords.
    test_dataset = ASRSliceDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config),
    )
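# A minimal sketch, not part of the original script, of consuming the dataset
# built above: .create(batch_size) yields a tf.data.Dataset, mirroring how the
# tests earlier in this section iterate theirs. Batch size 1 is illustrative.
test_data = test_dataset.create(1)
first_batch = next(iter(test_data))
print([tensor.shape for tensor in first_batch])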
import os
import argparse

import tensorflow as tf

from tensorflow_asr.utils.env_util import setup_strategy

logger = tf.get_logger()

DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")

tf.keras.backend.clear_session()

parser = argparse.ArgumentParser(prog="Vocab Training with SentencePiece")

parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                    help="The file path of model configuration file")

parser.add_argument("--devices", type=int, nargs="*", default=[0],
                    help="Devices' ids to apply distributed training")

args = parser.parse_args()

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.text_featurizers import SentencePieceFeaturizer

config = Config(args.config)

logger.info("Generating subwords ...")
text_featurizer = SentencePieceFeaturizer.build_from_corpus(config.decoder_config)
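# For reference, a minimal sketch of roughly what the vocabulary-building step
# amounts to with the plain `sentencepiece` package, using the decoder-config
# keys shown earlier in this section (corpus_files, model_type,
# target_vocab_size, output_path_prefix). It assumes plain-text corpus input;
# the featurizer itself reads transcript TSVs, so this is an illustrative
# equivalent, not the library's actual implementation.
import sentencepiece as spm

decoder_config = {
    "corpus_files": ["/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv"],
    "model_type": "unigram",
    "target_vocab_size": 8000,
    "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
}

spm.SentencePieceTrainer.Train(
    input=",".join(decoder_config["corpus_files"]),
    model_prefix=decoder_config["output_path_prefix"],
    model_type=decoder_config["model_type"],
    vocab_size=decoder_config["target_vocab_size"],
)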