def _gen_data():
    for clean_wav_path in self.data_paths:
        noisy_wav_path = clean_wav_path.replace(self.clean_dir, self.noisy_dir)
        clean_wav = read_raw_audio(clean_wav_path, sample_rate=self.speech_featurizer.sample_rate)
        noisy_wav = read_raw_audio(noisy_wav_path, sample_rate=self.speech_featurizer.sample_rate)
        clean_slices, noisy_slices = self.parse(clean_wav, noisy_wav)
        for clean, noisy in zip(clean_slices, noisy_slices):
            yield clean, noisy
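For context: a nested generator like this is normally handed straight to tf.data.Dataset.from_generator. The sketch below is an assumption about that wiring, not code from this collection; the 1-D float32 slice shapes and the batch size are illustrative.

import tensorflow as tf

# Assumed consumer of the _gen_data closure defined above.
dataset = tf.data.Dataset.from_generator(
    _gen_data,
    output_signature=(
        tf.TensorSpec(shape=[None], dtype=tf.float32),  # clean slice
        tf.TensorSpec(shape=[None], dtype=tf.float32),  # noisy slice
    ),
)
dataset = dataset.padded_batch(8).prefetch(tf.data.AUTOTUNE)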
def _gen_data():
    for clean_wav_path in self.data_paths:
        clean_wav = read_raw_audio(clean_wav_path, sample_rate=self.speech_config["sample_rate"])
        noisy_wav_path = clean_wav_path.replace(self.clean_dir, self.noisy_dir)
        noisy_wav = read_raw_audio(noisy_wav_path, sample_rate=self.speech_config["sample_rate"])
        clean_slices, noisy_slices = self.parse(clean_wav, noisy_wav)
        yield clean_wav_path, clean_slices, noisy_slices
def main(
    filename: str,
    tflite: str = None,
    blank: int = 0,
    num_rnns: int = 1,
    nstates: int = 2,
    statesize: int = 320,
):
    tflitemodel = tf.lite.Interpreter(model_path=tflite)
    signal = read_raw_audio(filename)
    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
    tflitemodel.allocate_tensors()
    tflitemodel.set_tensor(input_details[0]["index"], signal)
    tflitemodel.set_tensor(input_details[1]["index"], tf.constant(blank, dtype=tf.int32))
    tflitemodel.set_tensor(
        input_details[2]["index"],
        tf.zeros([num_rnns, nstates, 1, statesize], dtype=tf.float32),
    )
    tflitemodel.invoke()
    hyp = tflitemodel.get_tensor(output_details[0]["index"])
    print("".join([chr(u) for u in hyp]))
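The keyword-only parameters suggest this main is wrapped by a CLI dispatcher; a minimal assumed entry point (python-fire is a guess based on the signature style, not confirmed by the snippet):

if __name__ == "__main__":
    import fire  # assumption: maps --filename/--tflite/... flags onto main()
    fire.Fire(main)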
def main(argv):
    speech_file = argv[1]
    feature_type = argv[2]
    speech_conf = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "feature_type": feature_type,
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
        "num_feature_bins": 80,
    }
    signal = read_raw_audio(speech_file, speech_conf["sample_rate"])
    nsf = NumpySpeechFeaturizer(speech_conf)
    sf = TFSpeechFeaturizer(speech_conf)
    ft = nsf.stft(signal)
    print(ft.shape, np.mean(ft))
    ft = sf.stft(signal).numpy()
    print(ft.shape, np.mean(ft))
    ft = sf.extract(signal)
    plt.figure(figsize=(16, 2.5))
    ax = plt.gca()
    ax.set_title(f"{feature_type}", fontweight="bold")
    librosa.display.specshow(ft.T, cmap="magma")
    v1 = np.linspace(ft.min(), ft.max(), 8, endpoint=True)
    plt.colorbar(pad=0.01, fraction=0.02, ax=ax, format="%.2f", ticks=v1)
    plt.tight_layout()
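A hedged usage note: main reads the audio path and feature type positionally from argv, so the assumed entry point and invocation look like this (the script name and feature-type value are illustrative):

if __name__ == "__main__":
    import sys
    # e.g. python compare_featurizers.py sample.wav log_mel_spectrogram
    main(sys.argv)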
def _gen_data():
    for clean_wav_path in self.data_paths:
        clean_wav = read_raw_audio(clean_wav_path, sample_rate=self.speech_featurizer.sample_rate)
        # in this variant, parse() derives both the clean and the noisy
        # slices from the clean signal alone
        clean_slices, noisy_slices = self.parse(clean_wav)
        for clean, noisy in zip(clean_slices, noisy_slices):
            yield clean, noisy
def main(
    saved_model: str = None,
    filename: str = None,
):
    tf.keras.backend.clear_session()
    module = tf.saved_model.load(export_dir=saved_model)
    signal = read_raw_audio(filename)
    transcript = module.pred(signal)
    print("Transcript: ", "".join([chr(u) for u in transcript]))
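module.pred implies the SavedModel exports a serving function named pred that maps a raw signal to unicode code points; a minimal sketch of how such a signature could be attached before export (ExportModule, the input spec, and the model call are assumptions, not the exported code):

import tensorflow as tf

class ExportModule(tf.Module):
    """Hypothetical wrapper that attaches a `pred` signature before export."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)])
    def pred(self, signal):
        # assumed: the wrapped model maps a 1-D signal to int32 code points
        return self.model(signal)

# tf.saved_model.save(ExportModule(model), export_dir="path/to/saved_model")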
def preprocess(self, path, transcript):
    with tf.device("/CPU:0"):
        signal = read_raw_audio(path.decode("utf-8"), self.speech_featurizer.sample_rate)
        features = self.speech_featurizer.extract(signal)
        features = tf.convert_to_tensor(features, tf.float32)
        input_length = tf.cast(tf.shape(features)[0], tf.int32)
        label = self.text_featurizer.extract(transcript.decode("utf-8"))
        label = tf.convert_to_tensor(label, dtype=tf.int32)
        return path, features, input_length, label
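The .decode("utf-8") calls show that path and transcript arrive as numpy bytes, i.e. preprocess runs eagerly; a sketch of the assumed tf.data wiring through tf.numpy_function (map_fn and the dataset variable are illustrative, not from the original):

def map_fn(path, transcript):
    # Tout mirrors preprocess()'s return: path, features, input_length, label
    return tf.numpy_function(
        self.preprocess,
        inp=[path, transcript],
        Tout=[tf.string, tf.float32, tf.int32, tf.int32],
    )

# dataset = dataset.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)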
def fn(_path: bytes, _audio: bytes, _indices: bytes):
    signal = read_raw_audio(_audio, sample_rate=self.speech_featurizer.sample_rate)
    signal = self.augmentations.signal_augment(signal)
    features = self.speech_featurizer.extract(signal.numpy())
    features = self.augmentations.feature_augment(features)
    features = tf.convert_to_tensor(features, tf.float32)
    input_length = tf.cast(tf.shape(features)[0], tf.int32)
    label = tf.strings.to_number(tf.strings.split(_indices), out_type=tf.int32)
    label_length = tf.cast(tf.shape(label)[0], tf.int32)
    # "prepand_blank" is the method's spelling as defined by the text featurizer API
    prediction = self.text_featurizer.prepand_blank(label)
    prediction_length = tf.cast(tf.shape(prediction)[0], tf.int32)
    return _path, features, input_length, label, label_length, prediction, prediction_length
def main(argv):
    speech_file = argv[1]
    feature_type = argv[2]
    augments = {
        # "after": {
        #     "time_masking": {
        #         "num_masks": 10,
        #         "mask_factor": 100,
        #         "p_upperbound": 0.05
        #     },
        #     "freq_masking": {
        #         "mask_factor": 27
        #     }
        # },
    }
    au = UserAugmentation(augments)
    speech_conf = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "feature_type": feature_type,
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
        "num_feature_bins": 80,
    }
    signal = read_raw_audio(speech_file, speech_conf["sample_rate"])
    sf = NumpySpeechFeaturizer(speech_conf)
    ft = sf.extract(signal)
    ft = au["after"].augment(ft)[:, :, 0]
    plt.figure(figsize=(16, 2.5))
    ax = plt.gca()
    ax.set_title(f"{feature_type}", fontweight="bold")
    librosa.display.specshow(ft.T, cmap="magma")
    v1 = np.linspace(ft.min(), ft.max(), 8, endpoint=True)
    plt.colorbar(pad=0.01, fraction=0.02, ax=ax, format="%.2f", ticks=v1)
    plt.tight_layout()
    # plt.savefig(argv[3])
    plt.show()
print("Loading subwords ...") text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) else: text_featurizer = CharFeaturizer(config.decoder_config) text_featurizer.decoder_config.beam_width = args.beam_width # build model conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) conformer.make(speech_featurizer.shape) conformer.load_weights(args.saved, by_name=True, skip_mismatch=True) conformer.summary(line_length=120) conformer.add_featurizers(speech_featurizer, text_featurizer) signal = read_raw_audio(args.filename) features = speech_featurizer.tf_extract(signal) input_length = math_util.get_reduced_length( tf.shape(features)[0], conformer.time_reduction_factor) if args.beam_width: transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...]) print("Transcript:", transcript[0].numpy().decode("UTF-8")) elif args.timestamp: transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp( signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state()) print("Transcript:", transcript) print("Start time:", stime) print("End time:", etime)
def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")
    parser.add_argument("--config", type=str, default=None, help="Config file")
    parser.add_argument("--audio", type=str, default=None, help="Audio file")
    parser.add_argument("--saved_model", type=str, default=None, help="Saved model")
    parser.add_argument("--from_weights", type=bool, default=False, help="Load from weights")
    parser.add_argument("--output", type=str, default=None, help="Output dir storing histograms")
    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(
        Scorer(**text_featurizer.decoder_config["lm_config"],
               vocabulary=text_featurizer.vocab_array))

    # build the model and restore parameters
    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])
    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())
    satt_ds2_model.summary(line_length=100)
    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    # decode one utterance with the LM-rescored beam search
    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    # histogram of every intermediate layer's activations
    for i in range(1, len(satt_ds2_model.base_model.layers)):
        func = tf.keras.backend.function([satt_ds2_model.base_model.input],
                                         [satt_ds2_model.base_model.layers[i].output])
        data = func([np.expand_dims(features, 0), 1])[0][0]
        print(data.shape)
        data = data.flatten()
        plt.hist(data, 200, color="green", histtype="stepfilled")
        plt.title(f"Output of {satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
        plt.savefig(os.path.join(
            args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
        plt.clf()
        plt.cla()
        plt.close()

    # histogram of the final projection, before and after softmax
    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.hist(fc[0].numpy().flatten(), 200, color="green", histtype="stepfilled")
    plt.title(f"Output of {satt_ds2_model.layers[-1].name}", fontweight="bold")
    plt.savefig(os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()

    fc = tf.nn.softmax(fc)
    plt.hist(fc[0].numpy().flatten(), 10, color="green", histtype="stepfilled")
    plt.title("Output of softmax", fontweight="bold")
    plt.savefig(os.path.join(args.output, "softmax_hist.png"))
    plt.clf()
    plt.cla()
    plt.close()

    # histogram of the input features themselves
    plt.hist(features.flatten(), 200, color="green", histtype="stepfilled")
    plt.title("Log Mel Spectrogram", fontweight="bold")
    plt.savefig(os.path.join(args.output, "log_mel_spectrogram.png"))
    plt.clf()
    plt.cla()
    plt.close()
model.add_featurizers(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer
)

# features = tf.zeros(shape=[5, 50, 80, 1], dtype=tf.float32)
# pred = model.recognize(features)
# print(pred)
# pred = model.recognize_beam(features)
# print(pred)

# stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# logdir = '/tmp/logs/func/%s' % stamp
# writer = tf.summary.create_file_writer(logdir)

# signal = read_raw_audio(sys.argv[1], speech_featurizer.sample_rate)

# tf.summary.trace_on(graph=True, profiler=True)
# hyps = model.recognize_tflite(signal, 0, tf.zeros([1, 2, 1, 320], dtype=tf.float32))
# with writer.as_default():
#     tf.summary.trace_export(
#         name="recognize_tflite",
#         step=0,
#         profiler_outdir=logdir)

# print(hyps[0])

# hyps = model.recognize_beam(features)
    return extract_from_mel(features)  # tail of a truncated extract_from_audio definition

@tf.function(input_signature=[
    tf.TensorSpec(shape=[None, speech_config['n_mels'], 1], dtype=tf.float32, name="signal")
])
def extract_from_mel(features):
    with tf.device('/cpu:0'):
        encoded = conformer.encoder_inference(features)
    return encoded

suffix = '.wav'
mel_query = '_mel.npy'
feature_query = '_conformer_enc16.npy'
audio_files = sorted(find_files(args.dataset, '*' + suffix))
print('files:', len(audio_files), audio_files[0])

for filename in tqdm(audio_files):
    mel = filename.replace(suffix, mel_query)
    if os.path.exists(mel):
        # reuse the precomputed mel spectrogram when it exists
        features = np.load(mel).reshape([-1, speech_config['n_mels'], 1])
        encoded = extract_from_mel(features)
    else:
        signal = read_raw_audio(filename)
        encoded = extract_from_audio(signal)
    np.save(filename.replace(suffix, feature_query), encoded)
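The stray return extract_from_mel(features) opening this snippet is the tail of an extract_from_audio definition that was cut off; a hedged reconstruction (the tf_extract call and the speech_featurizer name are assumptions based on the shapes the loop feeds in):

@tf.function(input_signature=[
    tf.TensorSpec(shape=[None], dtype=tf.float32, name="signal")
])
def extract_from_audio(signal):
    # assumed: raw signal -> mel features, then the shared mel path above
    features = speech_featurizer.tf_extract(signal)
    return extract_from_mel(features)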
def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")
    parser.add_argument("--config", type=str, default=None, help="Config file")
    parser.add_argument("--audio", type=str, default=None, help="Audio file")
    parser.add_argument("--saved_model", type=str, default=None, help="Saved model")
    parser.add_argument("--from_weights", type=bool, default=False, help="Load from weights")
    parser.add_argument("--output", type=str, default=None, help="Output dir storing histograms")
    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(
        Scorer(**text_featurizer.decoder_config["lm_config"],
               vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])
    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())
    satt_ds2_model.summary(line_length=100)
    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    # for i in range(1, len(satt_ds2_model.base_model.layers)):
    #     func = tf.keras.backend.function([satt_ds2_model.base_model.input],
    #                                      [satt_ds2_model.base_model.layers[i].output])
    #     data = func([np.expand_dims(features, 0), 1])[0][0]
    #     print(data.shape)
    #     plt.figure(figsize=(16, 5))
    #     ax = plt.gca()
    #     im = ax.imshow(data.T, origin="lower", aspect="auto")
    #     ax.set_title(f"{satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
    #     divider = make_axes_locatable(ax)
    #     cax = divider.append_axes("right", size="5%", pad=0.05)
    #     plt.colorbar(im, cax=cax)
    #     plt.savefig(os.path.join(
    #         args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
    #     plt.clf()
    #     plt.cla()
    #     plt.close()

    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title(f"{satt_ds2_model.layers[-1].name}", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()

    fc = tf.nn.softmax(fc)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Softmax", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "softmax.png"))
    plt.clf()
    plt.cla()
    plt.close()

    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Log Mel Spectrogram", fontweight="bold")
    im = ax.imshow(features[:, :, 0].T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "features.png"))
    plt.clf()
    plt.cla()
    plt.close()
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile

# Assumed imports (missing from the original snippet): Config and
# read_raw_audio come from TensorFlowASR, as in the other snippets here.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio


def visual(title, audio, sample_rate):
    plt.figure(figsize=(8, 4))
    librosa.display.waveplot(audio, sr=sample_rate)  # renamed to waveshow in librosa >= 0.10
    plt.title(title)
    plt.tight_layout()
    # plt.show()
    plt.savefig("./audio_b.jpg")


config_dir = "tests/config_aishell.yml"
config = Config(config_dir, learning=True)
aug = config.learning_config.augmentations

sampling_rate = 16000
audio = '/tsdata/ASR/aishell-1//wav/train/S0002/BAC009S0002W0123.wav'
signal = read_raw_audio(audio, sampling_rate)
# visual('Original', signal, sampling_rate)
signal = aug.before.augment(signal)
visual('Augmented', signal, sampling_rate)  # plot the signal after augmentation
soundfile.write('./test_d.wav', signal, 16000)