# Fragment of a ContextNet (Keras) training script: build speech/text
# featurizers and the train/eval TFRecord datasets.
# NOTE(review): relies on `Config`, `args`, and `os` defined earlier in the
# original file (not visible in this chunk).
from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
from tensorflow_asr.models.keras.contextnet import ContextNet
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

# Text featurizer selection: SentencePiece model, pre-built subword file,
# or freshly generated subwords (saved back to args.subwords for reuse).
if args.sentence_piece:
    print("Loading SentencePiece model ...")
    text_featurizer = SentencePieceFeaturizer.load_from_file(
        config.decoder_config, args.subwords)
elif args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
else:
    print("Generating subwords ...")
    text_featurizer = SubwordFeaturizer.build_from_corpus(
        config.decoder_config, corpus_files=args.subwords_corpus)
    text_featurizer.save_to_file(args.subwords)

# indefinite=True: training dataset repeats forever (steps bound the epoch).
train_dataset = ASRTFRecordDatasetKeras(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer,
    **vars(config.learning_config.train_dataset_config),
    indefinite=True)

eval_dataset = ASRTFRecordDatasetKeras(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer,
    **vars(config.learning_config.eval_dataset_config),
    # NOTE(review): chunk truncated here — remaining kwargs and the closing
    # parenthesis of this call are not visible in this view.
# Fragment of a Conformer evaluation ("test") script: pin the device, load
# featurizers, and build the test dataset.
# NOTE(review): `setup_devices`, `args`, `os`, and `tf` come from earlier in
# the original file (not visible in this chunk).
setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
from tensorflow_asr.models.conformer import Conformer

config = Config(args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

# Evaluation requires the exact subword vocabulary used at training time,
# so a missing subword file is a hard error (no regeneration here).
if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
else:
    raise ValueError("subwords must be set")

# Fixed seed for reproducible evaluation.
tf.random.set_seed(0)
assert args.saved

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.test_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="test",
        shuffle=False)
else:
    # NOTE(review): chunk truncated here — the non-TFRecord branch
    # (presumably an ASRSliceDataset, per the import above) is not visible.
# Standalone script: generate a subword vocabulary from transcript corpora
# and save it to the given output file.
# NOTE(review): relies on `os`, `argparse`, `Config`, and `SubwordFeaturizer`
# imported earlier in the original file (not visible in this chunk).
DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")

parser = argparse.ArgumentParser(prog="Vocab Training with Subwords")
parser.add_argument("corpus", nargs="*", type=str, default=[],
                    help="Transcript files for generating subwords")
parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                    help="The file path of model configuration file")
parser.add_argument("--output_file", type=str, default=None,
                    help="Path to file that stores generated subwords")
args = parser.parse_args()

# Fix: --output_file defaulted to None and was passed straight to
# save_to_file() only AFTER the (potentially expensive) corpus scan.
# Fail fast with a clear usage error before doing any work.
if not args.output_file:
    parser.error("--output_file is required to store the generated subwords")

config = Config(args.config)

print("Generating subwords ...")
text_featurizer = SubwordFeaturizer.build_from_corpus(config.decoder_config, args.corpus)
text_featurizer.save_to_file(args.output_file)
# Fragment of a Jasper → TFLite export script.
# NOTE(review): `parser`, `Config`, `TFSpeechFeaturizer`, `SubwordFeaturizer`,
# `CharFeaturizer`, `Jasper`, and `tf` come from earlier in the original file.
parser.add_argument("--subwords", type=str, default=None, help="Use subwords")
parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
args = parser.parse_args()

# Both a checkpoint to load and an output path are mandatory.
assert args.saved and args.output

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
# NOTE(review): here SubwordFeaturizer is constructed directly from the
# decoder config (not load_from_file as in the other fragments) — presumably
# the vocabulary path lives inside decoder_config; verify against the
# featurizer's constructor.
if args.subwords:
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    text_featurizer = CharFeaturizer(config.decoder_config)

# build model
jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
jasper.make(speech_featurizer.shape)
jasper.load_weights(args.saved, by_name=True)
jasper.summary(line_length=100)
# Attach featurizers so the exported graph embeds pre/post-processing.
jasper.add_featurizers(speech_featurizer, text_featurizer)

# Trace the inference function and hand it to the TFLite converter.
concrete_func = jasper.make_tflite_function().get_concrete_function()
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
converter.experimental_new_converter = True
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# NOTE(review): chunk ends here — converter.convert() and writing the
# .tflite file presumably follow but are not visible in this view.
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio

# Manual smoke test for subword featurization: build a vocabulary from the
# given transcript files, dump its statistics, then round-trip the phrase
# "hello world" through extract() and indices2upoints().
parser = argparse.ArgumentParser(prog="test subword")
parser.add_argument("transcripts", nargs="+", type=str, default=[None])
args = parser.parse_args()

# Inline decoder configuration (no YAML file needed for this check).
config = {
    "vocabulary": None,
    "target_vocab_size": 1024,
    "max_subword_length": 4,
    "blank_at_zero": True,
    "beam_width": 5,
    "norm_score": True,
}

text_featurizer = SubwordFeaturizer.build_from_corpus(config, args.transcripts)

# Vocabulary statistics: learned subword count, unicode points, class count.
for stat in (
    len(text_featurizer.subwords.subwords),
    text_featurizer.upoints,
    text_featurizer.num_classes,
):
    print(stat)

# Encode a sample phrase to subword indices ...
a = text_featurizer.extract("hello world")
print(a)

# ... then decode the indices back to code points and re-encode as a string.
b = text_featurizer.indices2upoints(a)
tf.print(tf.strings.unicode_encode(b, "UTF-8"))
def main():
    """Train a Conformer transducer with an additional regularising dataset
    (multi-reader training): a primary train/eval pair plus a second
    "reg" train/eval pair weighted by alpha in the trainer."""
    parser = argparse.ArgumentParser(prog="Conformer Training")
    parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                        help="The file path of model configuration file")
    parser.add_argument("--max_ckpts", type=int, default=10,
                        help="Max number of checkpoints to keep")
    parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
    parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
    parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
    parser.add_argument("--devices", type=int, nargs="*", default=[0],
                        help="Devices' ids to apply distributed training")
    parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
    parser.add_argument("--subwords", type=str, default=None,
                        help="Path to file that stores generated subwords")
    parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
                        help="Transcript files for generating subwords")
    # Primary and regularising dataset manifests (train/dev TSVs).
    parser.add_argument("--train-dir", '-td', nargs='*',
                        default=["en_ng_male_train.tsv", "en_ng_female_train.tsv"])
    parser.add_argument("--train-reg-dir", '-trd', nargs='*',
                        default=["libritts_train-clean-100.tsv",
                                 "libritts_train-clean-360.tsv",
                                 "libritts_train-other-500.tsv"])
    parser.add_argument("--dev-dir", '-dd', nargs='*',
                        default=["en_ng_male_eval.tsv", "en_ng_female_eval.tsv"])
    parser.add_argument("--dev-reg-dir", '-drd', nargs='*',
                        default=["libritts_test-other.tsv"])
    args = parser.parse_args()

    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
    strategy = setup_strategy(args.devices)

    config = Config(args.config, learning=True)
    # Override the config's dataset locations with the CLI-supplied manifests.
    config.train_dir = args.train_dir
    config.dev_dir = args.dev_dir
    config.train_reg_dir = args.train_reg_dir
    config.dev_reg_dir = args.dev_reg_dir

    # Speech config lives in a separate YAML file referenced by the main config.
    with open(config.speech_config) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    speech_featurizer = TFSpeechFeaturizer(speech_config)

    # Reuse a saved subword file when available; otherwise build one from the
    # corpus and persist it to args.subwords for later runs.
    if args.subwords and os.path.exists(args.subwords):
        print("Loading subwords ...")
        text_featurizer = SubwordFeaturizer.load_from_file(
            config.decoder_config, args.subwords)
    else:
        print("Generating subwords ...")
        text_featurizer = SubwordFeaturizer.build_from_corpus(
            config.decoder_config, corpus_files=args.subwords_corpus)
        text_featurizer.save_to_file(args.subwords)

    # Primary training data (augmented).
    train_dataset = Dataset(data_paths=config.train_dir,
                            speech_featurizer=speech_featurizer,
                            text_featurizer=text_featurizer,
                            augmentations=config.learning_config.augmentations,
                            stage="train",
                            cache=False,
                            shuffle=False)
    # Regularising training data; DatasetInf presumably repeats indefinitely —
    # TODO confirm against its definition.
    train_reg_dataset = DatasetInf(
        data_paths=config.train_reg_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train",
        cache=False,
        shuffle=False)
    eval_dataset = Dataset(data_paths=config.dev_dir,
                           speech_featurizer=speech_featurizer,
                           text_featurizer=text_featurizer,
                           stage="eval",
                           cache=False,
                           shuffle=False)
    eval_reg_dataset = DatasetInf(
        data_paths=config.dev_reg_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="eval",
        cache=False,
        shuffle=False)

    conformer_trainer = MultiReaderTransducerTrainer(
        config=config.learning_config.running_config,
        text_featurizer=text_featurizer,
        strategy=strategy)

    # Model and optimizer must be created under the distribution strategy scope.
    with conformer_trainer.strategy.scope():
        # build model
        conformer = Conformer(**config.model_config,
                              vocabulary_size=text_featurizer.num_classes)
        conformer._build(speech_featurizer.shape)
        conformer.summary(line_length=120)

        # Noam-style warmup schedule scaled by the model dimension.
        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(d_model=conformer.dmodel,
                                warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
                                max_lr=(0.05 / math.sqrt(conformer.dmodel))),
            beta_1=config.learning_config.optimizer_config["beta1"],
            beta_2=config.learning_config.optimizer_config["beta2"],
            epsilon=config.learning_config.optimizer_config["epsilon"])

    # NOTE(review): indentation reconstructed from a collapsed source line —
    # compile/fit are placed outside the strategy scope, matching the sibling
    # TensorFlowASR training scripts; confirm against the original file.
    conformer_trainer.compile(model=conformer,
                              optimizer=optimizer,
                              max_to_keep=args.max_ckpts)

    conformer_trainer.fit(
        train_dataset,
        train_reg_dataset,
        # alpha for regularising dataset; alpha = 1 for training dataset
        1.,
        eval_dataset,
        eval_reg_dataset,
        train_bs=args.tbs,
        eval_bs=args.ebs,
        train_acs=args.acs)
def process(text):
    # Encode one raw transcript line (bytes from TextLineDataset) into
    # (decoder input, decoder target) index sequences:
    #   input  = [blank] + subwords        (blank prepended)
    #   target = subwords + [blank]        (blank appended)
    # so input and target stay the same length (asserted below).
    # NOTE(review): `prepand_blank` is the project API's own spelling of
    # "prepend_blank" — do not "fix" the name here.
    encoded_output = subword.extract(text.decode('utf-8'))
    encoded_input = subword.prepand_blank(encoded_output)
    encoded_output = tf.concat([encoded_output, [subword.blank]], axis=0)
    assert encoded_input.shape == encoded_output.shape
    return encoded_input, encoded_output


@tf.function
def parse(record):
    # Bridge the eager `process` (needs .decode on bytes) into the tf.data
    # graph via numpy_function; both outputs are int32 index tensors.
    return tf.numpy_function(process, inp=[record], Tout=[tf.int32, tf.int32])


config = Config('config.yml', learning=True)
subword = SubwordFeaturizer.load_from_file(
    config.decoder_config,
    '/home/joaoalvarenga/datasets/conformer_subwords.subwords')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'checkpoint/lm.ckpt', save_weights_only=True, verbose=1)

print(subword.num_classes)

batch_size = 32
# Text corpus: one transcript per line (hard-coded local path).
dataset = tf.data.TextLineDataset(
    '/media/work/joaoalvarenga/ptwiki-20181125.txt')
dataset = dataset.map(parse)
dataset = dataset.cache()
# dataset = dataset.batch(batch_size, drop_remainder=True)
# Pad both sequences in a batch to the longest one, using the blank index
# as padding so padding is indistinguishable from the blank token.
dataset = dataset.padded_batch(
    batch_size=batch_size,
    padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None])),
    padding_values=(subword.blank, subword.blank),
    # NOTE(review): chunk truncated here — remaining kwargs and the closing
    # parenthesis of this call are not visible in this view.