def run(args):
    config = UserConfig(DEFAULT_YAML, args.config, learning=True)
    speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
    text_featurizer = TextFeaturizer(config["decoder_config"])
    tf.random.set_seed(0)
    assert args.saved_model

    if args.tfrecords:
        test_dataset = ASRTFRecordDataset(
            config["learning_config"]["dataset_config"]["test_paths"],
            config["learning_config"]["dataset_config"]["tfrecords_dir"],
            speech_featurizer, text_featurizer, "test",
            augmentations=config["learning_config"]["augmentations"],
            shuffle=False
        ).create(config["learning_config"]["running_config"]["batch_size"])
    else:
        test_dataset = ASRSliceDataset(
            stage="test",
            speech_featurizer=speech_featurizer,
            text_featurizer=text_featurizer,
            data_paths=config["learning_config"]["dataset_config"]["test_paths"],
            shuffle=False
        ).create(config["learning_config"]["running_config"]["batch_size"])

    # Build model
    f, c = speech_featurizer.compute_feature_dim()
    conformer = Conformer(vocabulary_size=text_featurizer.num_classes,
                          **config["model_config"])
    conformer._build([1, 50, f, c])
    conformer.summary(line_length=100)

    conformer_tester = BaseTester(
        config=config["learning_config"]["running_config"],
        saved_path=args.saved_model,
        from_weights=args.from_weights)
    conformer_tester.compile(conformer, speech_featurizer, text_featurizer)
    conformer_tester.run(test_dataset)
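
# run() above expects a parsed argparse namespace. A minimal sketch of the
# CLI wiring it assumes, with flag names inferred from the attribute accesses
# (args.config, args.saved_model, args.from_weights, args.tfrecords); the
# repository's actual entry point may differ.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(prog="Conformer Tester")
    parser.add_argument("--config", type=str, default=None,
                        help="Path to the model configuration YAML")
    parser.add_argument("--saved_model", type=str, default=None,
                        help="Path to the saved model or weights to test")
    parser.add_argument("--from_weights", type=bool, default=False,
                        help="Whether saved_model holds weights only")
    parser.add_argument("--tfrecords", type=bool, default=False,
                        help="Whether to read the test set from TFRecords")
    run(parser.parse_args())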
parser.add_argument("--saved", type=str, default=None, help="Path to saved model") parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported") args = parser.parse_args() assert args.saved and args.output config = UserConfig(DEFAULT_YAML, args.config, learning=True) speech_featurizer = TFSpeechFeaturizer(config["speech_config"]) text_featurizer = CharFeaturizer(config["decoder_config"]) # build model conformer = Conformer(**config["model_config"], vocabulary_size=text_featurizer.num_classes) conformer._build(speech_featurizer.shape) conformer.load_weights(args.saved) conformer.summary(line_length=150) conformer.add_featurizers(speech_featurizer, text_featurizer) concrete_func = conformer.make_tflite_function( greedy=True).get_concrete_function() converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) converter.optimizations = [tf.lite.Optimize.DEFAULT]
def main(): parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram") parser.add_argument("--config", type=str, default=None, help="Config file") parser.add_argument("--audio", type=str, default=None, help="Audio file") parser.add_argument("--saved_model", type=str, default=None, help="Saved model") parser.add_argument("--from_weights", type=bool, default=False, help="Load from weights") parser.add_argument("--output", type=str, default=None, help="Output dir storing histograms") args = parser.parse_args() config = UserConfig(args.config, args.config, learning=False) speech_featurizer = SpeechFeaturizer(config["speech_config"]) text_featurizer = TextFeaturizer(config["decoder_config"]) text_featurizer.add_scorer(Scorer(**text_featurizer.decoder_config["lm_config"], vocabulary=text_featurizer.vocab_array)) f, c = speech_featurizer.compute_feature_dim() satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c], arch_config=config["model_config"], num_classes=text_featurizer.num_classes) satt_ds2_model._build([1, 50, f, c]) if args.from_weights: satt_ds2_model.load_weights(args.saved_model) else: saved_model = tf.keras.models.load_model(args.saved_model) satt_ds2_model.set_weights(saved_model.get_weights()) satt_ds2_model.summary(line_length=100) satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer) signal = read_raw_audio(args.audio, speech_featurizer.sample_rate) features = speech_featurizer.extract(signal) decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True) print(bytes_to_string(decoded.numpy())) for i in range(1, len(satt_ds2_model.base_model.layers)): func = tf.keras.backend.function([satt_ds2_model.base_model.input], [satt_ds2_model.base_model.layers[i].output]) data = func([np.expand_dims(features, 0), 1])[0][0] print(data.shape) data = data.flatten() plt.hist(data, 200, color='green', histtype="stepfilled") plt.title(f"Output of {satt_ds2_model.base_model.layers[i].name}", fontweight="bold") plt.savefig(os.path.join( args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png")) plt.clf() plt.cla() plt.close() fc = satt_ds2_model(tf.expand_dims(features, 0), training=False) plt.hist(fc[0].numpy().flatten(), 200, color="green", histtype="stepfilled") plt.title(f"Output of {satt_ds2_model.layers[-1].name}", fontweight="bold") plt.savefig(os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png")) plt.clf() plt.cla() plt.close() fc = tf.nn.softmax(fc) plt.hist(fc[0].numpy().flatten(), 10, color="green", histtype="stepfilled") plt.title("Output of softmax", fontweight="bold") plt.savefig(os.path.join(args.output, "softmax_hist.png")) plt.clf() plt.cla() plt.close() plt.hist(features.flatten(), 200, color="green", histtype="stepfilled") plt.title("Log Mel Spectrogram", fontweight="bold") plt.savefig(os.path.join(args.output, "log_mel_spectrogram.png")) plt.clf() plt.cla() plt.close()
def run(args):
    config = UserConfig(DEFAULT_YAML, args.config, learning=True)
    speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
    text_featurizer = TextFeaturizer(config["decoder_config"])
    tf.random.set_seed(2020)

    if args.mixed_precision:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print("Enabled mixed precision training")

    if args.tfrecords:
        train_dataset = ASRTFRecordDataset(
            config["learning_config"]["dataset_config"]["train_paths"],
            config["learning_config"]["dataset_config"]["tfrecords_dir"],
            speech_featurizer, text_featurizer, "train",
            augmentations=config["learning_config"]["augmentations"],
            shuffle=True)
        eval_dataset = ASRTFRecordDataset(
            config["learning_config"]["dataset_config"]["eval_paths"],
            config["learning_config"]["dataset_config"]["tfrecords_dir"],
            speech_featurizer, text_featurizer, "eval",
            shuffle=False)
    else:
        train_dataset = ASRSliceDataset(
            stage="train",
            speech_featurizer=speech_featurizer,
            text_featurizer=text_featurizer,
            data_paths=config["learning_config"]["dataset_config"]["train_paths"],
            augmentations=config["learning_config"]["augmentations"],
            shuffle=True)
        eval_dataset = ASRSliceDataset(
            stage="eval",
            speech_featurizer=speech_featurizer,
            text_featurizer=text_featurizer,
            data_paths=config["learning_config"]["dataset_config"]["eval_paths"],
            shuffle=False)

    conformer_trainer = TransducerTrainer(
        config=config["learning_config"]["running_config"],
        text_featurizer=text_featurizer,
        is_mixed_precision=args.mixed_precision)

    with conformer_trainer.strategy.scope():
        # build model
        f, c = speech_featurizer.compute_feature_dim()
        conformer = Conformer(**config["model_config"],
                              vocabulary_size=text_featurizer.num_classes)
        conformer._build([1, 50, f, c])

        optimizer_config = config["learning_config"]["optimizer_config"]
        optimizer = tf.keras.optimizers.Adam(
            TransformerSchedule(
                d_model=config["model_config"]["dmodel"],
                warmup_steps=optimizer_config["warmup_steps"],
                max_lr=(0.05 / math.sqrt(config["model_config"]["dmodel"]))),
            beta_1=float(optimizer_config["beta1"]),
            beta_2=float(optimizer_config["beta2"]),
            epsilon=float(optimizer_config["epsilon"]))

    conformer_trainer.compile(model=conformer, optimizer=optimizer,
                              max_to_keep=args.max_ckpts)
    conformer_trainer.fit(train_dataset, eval_dataset, args.eval_train_ratio)

    if args.export:
        if args.from_weights:
            conformer_trainer.model.save_weights(args.export)
        else:
            conformer_trainer.model.save(args.export)
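
# TransformerSchedule above is not shown in this file. Assuming it implements
# the warmup schedule from "Attention Is All You Need",
# lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), optionally
# capped at max_lr, a minimal sketch would look like this (the class name and
# the max_lr cap are assumptions, not the repository's code):
class TransformerScheduleSketch(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000, max_lr=None):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
        self.max_lr = max_lr

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # Rise linearly during warmup, then decay with the inverse square root
        lr = tf.math.rsqrt(self.d_model) * tf.math.minimum(
            tf.math.rsqrt(step), step * (self.warmup_steps ** -1.5))
        # Cap at max_lr if given (hypothetical behavior)
        return lr if self.max_lr is None else tf.math.minimum(lr, self.max_lr)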
def run(args):
    assert args.mode in modes, f"Mode must be in {modes}"
    config = UserConfig(DEFAULT_YAML, args.config, learning=True)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = TextFeaturizer(config["decoder_config"])

    if args.mode == "train":
        tf.random.set_seed(2020)
        if args.mixed_precision:
            policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
            tf.keras.mixed_precision.experimental.set_policy(policy)
            print("Enabled mixed precision training")

        ctc_trainer = CTCTrainer(speech_featurizer, text_featurizer,
                                 config["learning_config"]["running_config"],
                                 args.mixed_precision)

        if args.tfrecords:
            train_dataset = ASRTFRecordDataset(
                config["learning_config"]["dataset_config"]["train_paths"],
                config["learning_config"]["dataset_config"]["tfrecords_dir"],
                speech_featurizer, text_featurizer, "train",
                augmentations=config["learning_config"]["augmentations"],
                shuffle=True)
            eval_dataset = ASRTFRecordDataset(
                config["learning_config"]["dataset_config"]["eval_paths"],
                config["learning_config"]["dataset_config"]["tfrecords_dir"],
                speech_featurizer, text_featurizer, "eval",
                shuffle=False)
        else:
            train_dataset = ASRSliceDataset(
                stage="train",
                speech_featurizer=speech_featurizer,
                text_featurizer=text_featurizer,
                data_paths=config["learning_config"]["dataset_config"]["train_paths"],
                augmentations=config["learning_config"]["augmentations"],
                shuffle=True)
            eval_dataset = ASRSliceDataset(
                stage="eval",
                speech_featurizer=speech_featurizer,
                text_featurizer=text_featurizer,
                data_paths=config["learning_config"]["dataset_config"]["eval_paths"],
                shuffle=False)

        # Build DS2 model
        f, c = speech_featurizer.compute_feature_dim()
        with ctc_trainer.strategy.scope():
            satt_ds2_model = SelfAttentionDS2(
                input_shape=[None, f, c],
                arch_config=config["model_config"],
                num_classes=text_featurizer.num_classes)
            satt_ds2_model._build([1, 50, f, c])

        optimizer = create_optimizer(
            name=config["learning_config"]["optimizer_config"]["name"],
            d_model=config["model_config"]["att"]["head_size"],
            **config["learning_config"]["optimizer_config"]["config"])

        # Compile
        ctc_trainer.compile(satt_ds2_model, optimizer, max_to_keep=args.max_ckpts)
        ctc_trainer.fit(train_dataset, eval_dataset, args.eval_train_ratio)

        if args.export:
            if args.from_weights:
                ctc_trainer.model.save_weights(args.export)
            else:
                ctc_trainer.model.save(args.export)

    elif args.mode == "test":
        tf.random.set_seed(0)
        assert args.export
        text_featurizer.add_scorer(
            Scorer(**text_featurizer.decoder_config["lm_config"],
                   vocabulary=text_featurizer.vocab_array))

        # Build DS2 model
        f, c = speech_featurizer.compute_feature_dim()
        satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                          arch_config=config["model_config"],
                                          num_classes=text_featurizer.num_classes)
        satt_ds2_model._build([1, 50, f, c])
        satt_ds2_model.summary(line_length=100)

        optimizer = create_optimizer(
            name=config["learning_config"]["optimizer_config"]["name"],
            d_model=config["model_config"]["att"]["head_size"],
            **config["learning_config"]["optimizer_config"]["config"])

        batch_size = config["learning_config"]["running_config"]["batch_size"]
        if args.tfrecords:
            test_dataset = ASRTFRecordDataset(
                config["learning_config"]["dataset_config"]["test_paths"],
                config["learning_config"]["dataset_config"]["tfrecords_dir"],
                speech_featurizer, text_featurizer, "test",
                augmentations=config["learning_config"]["augmentations"],
                shuffle=False
            ).create(batch_size * args.eval_train_ratio)
        else:
            test_dataset = ASRSliceDataset(
                stage="test",
                speech_featurizer=speech_featurizer,
                text_featurizer=text_featurizer,
                data_paths=config["learning_config"]["dataset_config"]["test_paths"],
                augmentations=config["learning_config"]["augmentations"],
                shuffle=False
            ).create(batch_size * args.eval_train_ratio)

        ctc_tester = BaseTester(
            config=config["learning_config"]["running_config"],
            saved_path=args.export,
            from_weights=args.from_weights)
        ctc_tester.compile(satt_ds2_model, speech_featurizer, text_featurizer)
        ctc_tester.run(test_dataset)

    else:
        assert args.export
        # Build DS2 model
        f, c = speech_featurizer.compute_feature_dim()
        satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                          arch_config=config["model_config"],
                                          num_classes=text_featurizer.num_classes)
        satt_ds2_model._build([1, 50, f, c])

        optimizer = create_optimizer(
            name=config["learning_config"]["optimizer_config"]["name"],
            d_model=config["model_config"]["att"]["head_size"],
            **config["learning_config"]["optimizer_config"]["config"])

        def save_func(**kwargs):
            if args.from_weights:
                kwargs["model"].save_weights(args.export)
            else:
                kwargs["model"].save(args.export)

        save_from_checkpoint(
            func=save_func,
            outdir=config["learning_config"]["running_config"]["outdir"],
            model=satt_ds2_model,
            optimizer=optimizer)
def main():
    tf.keras.backend.clear_session()
    parser = argparse.ArgumentParser(prog="Deep Speech 2 Tester")
    parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML,
                        help="The file path of the model configuration file")
    parser.add_argument("--saved_path", "-e", type=str, default=None,
                        help="Path to the saved model or weights to test")
    parser.add_argument("--from_weights", type=bool, default=False,
                        help="Whether to load only weights")
    parser.add_argument("--tfrecords", type=bool, default=False,
                        help="Whether to use a TFRecords dataset")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="Batch size for testing")
    args = parser.parse_args()

    tf.random.set_seed(0)
    assert args.saved_path

    config = UserConfig(DEFAULT_YAML, args.config, learning=True)
    speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
    text_featurizer = TextFeaturizer(config["decoder_config"])

    # Build DS2 model
    f, c = speech_featurizer.compute_feature_dim()
    ds2_model = DeepSpeech2(input_shape=[None, f, c],
                            arch_config=config["model_config"],
                            num_classes=text_featurizer.num_classes,
                            name="deepspeech2")
    ds2_model._build([1, 50, f, c])
    ds2_model.summary(line_length=100)

    if args.tfrecords:
        test_dataset = ASRTFRecordDataset(
            config["learning_config"]["dataset_config"]["test_paths"],
            config["learning_config"]["dataset_config"]["tfrecords_dir"],
            speech_featurizer, text_featurizer, "test",
            augmentations=config["learning_config"]["augmentations"],
            shuffle=False).create(args.batch_size)
    else:
        test_dataset = ASRSliceDataset(
            stage="test",
            speech_featurizer=speech_featurizer,
            text_featurizer=text_featurizer,
            data_paths=config["learning_config"]["dataset_config"]["test_paths"],
            shuffle=False).create(args.batch_size)

    ctc_tester = BaseTester(config=config["learning_config"]["running_config"],
                            saved_path=args.saved_path,
                            from_weights=args.from_weights)
    ctc_tester.compile(ds2_model, speech_featurizer, text_featurizer)
    ctc_tester.run(test_dataset)
def main():
    tf.keras.backend.clear_session()
    parser = argparse.ArgumentParser(prog="Deep Speech 2 Training")
    parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML,
                        help="The file path of the model configuration file")
    parser.add_argument("--export", "-e", type=str, default=None,
                        help="Path to the model file to be exported")
    parser.add_argument("--mixed_precision", type=bool, default=False,
                        help="Whether to use mixed precision training")
    parser.add_argument("--save_weights", type=bool, default=False,
                        help="Whether to save only weights")
    parser.add_argument("--max_ckpts", type=int, default=10,
                        help="Max number of checkpoints to keep")
    parser.add_argument("--eval_train_ratio", type=int, default=1,
                        help="Ratio between train batch size and eval batch size")
    parser.add_argument("--tfrecords", type=bool, default=False,
                        help="Whether to use a TFRecords dataset")
    args = parser.parse_args()

    config = UserConfig(DEFAULT_YAML, args.config, learning=True)
    speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
    text_featurizer = TextFeaturizer(config["decoder_config"])
    tf.random.set_seed(2020)

    if args.mixed_precision:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print("Enabled mixed precision training")

    if args.tfrecords:
        train_dataset = ASRTFRecordDataset(
            config["learning_config"]["dataset_config"]["train_paths"],
            config["learning_config"]["dataset_config"]["tfrecords_dir"],
            speech_featurizer, text_featurizer, "train",
            augmentations=config["learning_config"]["augmentations"],
            shuffle=True)
        eval_dataset = ASRTFRecordDataset(
            config["learning_config"]["dataset_config"]["eval_paths"],
            config["learning_config"]["dataset_config"]["tfrecords_dir"],
            speech_featurizer, text_featurizer, "eval",
            shuffle=False)
    else:
        train_dataset = ASRSliceDataset(
            stage="train",
            speech_featurizer=speech_featurizer,
            text_featurizer=text_featurizer,
            data_paths=config["learning_config"]["dataset_config"]["train_paths"],
            augmentations=config["learning_config"]["augmentations"],
            shuffle=True)
        eval_dataset = ASRSliceDataset(
            stage="eval",
            speech_featurizer=speech_featurizer,
            text_featurizer=text_featurizer,
            data_paths=config["learning_config"]["dataset_config"]["eval_paths"],
            shuffle=False)

    ctc_trainer = CTCTrainer(speech_featurizer, text_featurizer,
                             config["learning_config"]["running_config"],
                             args.mixed_precision)

    # Build DS2 model
    f, c = speech_featurizer.compute_feature_dim()
    with ctc_trainer.strategy.scope():
        ds2_model = DeepSpeech2(input_shape=[None, f, c],
                                arch_config=config["model_config"],
                                num_classes=text_featurizer.num_classes,
                                name="deepspeech2")
        ds2_model._build([1, 50, f, c])

    # Compile
    ctc_trainer.compile(ds2_model, config["learning_config"]["optimizer_config"],
                        max_to_keep=args.max_ckpts)
    ctc_trainer.fit(train_dataset, eval_dataset, args.eval_train_ratio)

    if args.export:
        if args.save_weights:
            ctc_trainer.model.save_weights(args.export)
        else:
            ctc_trainer.model.save(args.export)
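
# The mixed-precision calls above use the tf.keras.mixed_precision.experimental
# namespace from the TF 2.0-2.3 era. A minimal equivalent for TF >= 2.4, shown
# as a sketch rather than code from this repository, uses the stable API:
def enable_mixed_precision():
    tf.keras.mixed_precision.set_global_policy("mixed_float16")
    print("Enabled mixed precision training")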
def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")
    parser.add_argument("--config", type=str, default=None, help="Config file")
    parser.add_argument("--audio", type=str, default=None, help="Audio file")
    parser.add_argument("--saved_model", type=str, default=None, help="Saved model")
    parser.add_argument("--from_weights", type=bool, default=False,
                        help="Load from weights")
    parser.add_argument("--output", type=str, default=None,
                        help="Output dir storing histograms")
    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(
        Scorer(**text_featurizer.decoder_config["lm_config"],
               vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)
    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    # for i in range(1, len(satt_ds2_model.base_model.layers)):
    #     func = tf.keras.backend.function(
    #         [satt_ds2_model.base_model.input],
    #         [satt_ds2_model.base_model.layers[i].output])
    #     data = func([np.expand_dims(features, 0), 1])[0][0]
    #     print(data.shape)
    #     plt.figure(figsize=(16, 5))
    #     ax = plt.gca()
    #     im = ax.imshow(data.T, origin="lower", aspect="auto")
    #     ax.set_title(f"{satt_ds2_model.base_model.layers[i].name}",
    #                  fontweight="bold")
    #     divider = make_axes_locatable(ax)
    #     cax = divider.append_axes("right", size="5%", pad=0.05)
    #     plt.colorbar(im, cax=cax)
    #     plt.savefig(os.path.join(
    #         args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
    #     plt.clf()
    #     plt.cla()
    #     plt.close()

    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title(f"{satt_ds2_model.layers[-1].name}", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()

    fc = tf.nn.softmax(fc)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Softmax", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "softmax.png"))
    plt.clf()
    plt.cla()
    plt.close()

    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Log Mel Spectrogram", fontweight="bold")
    im = ax.imshow(features[:, :, 0].T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "features.png"))
    plt.clf()
    plt.cla()
    plt.close()
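
# The three imshow/colorbar blocks in main() repeat the same boilerplate. A
# small helper that factors it out, sketched here (not part of the repository),
# built only from the calls already used above:
def save_heatmap(data, title, path):
    # data: a 2D array of shape [time, dim]; plotted transposed so time runs
    # along the x-axis, with a colorbar attached on the right
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title(title, fontweight="bold")
    im = ax.imshow(data.T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(path)
    plt.clf()
    plt.cla()
    plt.close()

# Example usage: save_heatmap(fc[0].numpy(), "Softmax",
#                             os.path.join(args.output, "softmax.png"))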
def run(args):
    assert args.mode in modes, f"Mode must be in {modes}"
    config = UserConfig(DEFAULT_YAML, args.config, learning=True)

    if args.mode == "train":
        tf.random.set_seed(2020)
        if args.mixed_precision:
            policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
            tf.keras.mixed_precision.experimental.set_policy(policy)
            print("Enabled mixed precision training")

        dataset = SeganDataset(
            "train",
            config["learning_config"]["dataset_config"]["train_paths"],
            config["learning_config"]["dataset_config"]["noise_config"],
            config["speech_config"],
            shuffle=True)

        segan_trainer = SeganTrainer(
            config["speech_config"],
            config["learning_config"]["running_config"],
            args.mixed_precision)
        segan_trainer.compile(config["model_config"],
                              config["learning_config"]["optimizer_config"],
                              max_to_keep=args.max_ckpts)
        segan_trainer.fit(train_dataset=dataset)

        if args.export:
            if args.from_weights:
                segan_trainer.generator.save_weights(args.export)
            else:
                segan_trainer.generator.save(args.export)

    elif args.mode == "test":
        tf.random.set_seed(0)
        assert args.export
        dataset = SeganDataset(
            "test",
            config["learning_config"]["dataset_config"]["test_paths"],
            config["learning_config"]["dataset_config"]["noise_config"],
            config["speech_config"],
            shuffle=False).create_test()

        segan_tester = SeganTester(
            config["speech_config"],
            config["learning_config"]["running_config"],
            args.export,
            from_weights=args.from_weights)
        segan_tester.compile(config["model_config"])
        segan_tester.run(dataset)

    else:
        assert args.export
        segan_trainer = SeganTrainer(
            config["speech_config"],
            config["learning_config"]["running_config"],
            args.mixed_precision)
        segan_trainer.compile(config["model_config"],
                              config["learning_config"]["optimizer_config"])
        segan_trainer.load_checkpoint()

        if args.from_weights:
            segan_trainer.generator.save_weights(args.export)
        else:
            segan_trainer.generator.save(args.export)