def build_am(self, config_path, model_path, *, vocabulary_size=1031):
    """Build a Conformer acoustic model and load its weights.

    Args:
        config_path: Path handed to ``Config`` (opened with ``learning=False``).
        model_path: Path of the weights file passed to ``load_weights``.
        vocabulary_size: Size of the output vocabulary. Defaults to 1031,
            the value that used to be hard-coded here; it has to agree with
            the vocabulary the weights were trained with.

    Returns:
        The ``Conformer`` instance with the weights loaded.
    """
    config = Config(config_path, learning=False)
    conformer = Conformer(**config.model_config, vocabulary_size=vocabulary_size)
    # The model is built against the shape reported by this object's speech
    # featurizer before any weights are loaded.
    conformer._build(self.speech_featurizer.shape)
    print('loading am...')
    # NOTE(review): by_name=True presumably matches weights to layers by
    # name (Keras semantics) — confirm Conformer is a Keras model.
    conformer.load_weights(model_path, by_name=True)
    return conformer
# A saved checkpoint is mandatory for evaluation. This is an explicit check
# rather than an `assert` because asserts are removed when Python runs with -O.
if not args.saved:
    raise ValueError("args.saved is required: path to the saved model weights to evaluate")

# Pick the test dataset implementation: TFRecord-backed when requested,
# otherwise the slice-based dataset. Both read the same test paths.
if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.test_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="test",
        shuffle=False,
    )
else:
    test_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.test_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="test",
        shuffle=False,
    )

# build model
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
conformer._build(speech_featurizer.shape)
# Weights are loaded only after the model has been built.
conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

# Run the evaluation over the test dataset with the configured tester.
conformer_tester = BaseTester(
    config=config.learning_config.running_config,
    output_name=args.output_name,
)
conformer_tester.compile(conformer)
conformer_tester.run(test_dataset)
# Load the configuration and create the featurizers it describes.
config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
text_featurizer = CharFeaturizer(config.decoder_config)

# Fixed seed for TensorFlow's global random generator.
tf.random.set_seed(0)

# A saved checkpoint is mandatory for evaluation. This is an explicit check
# rather than an `assert` because asserts are removed when Python runs with -O.
if not args.saved:
    raise ValueError("args.saved is required: path to the saved model weights to evaluate")

# Pick the test dataset implementation; both variants take their remaining
# arguments from the attributes of `test_dataset_config`.
if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config),
    )
else:
    test_dataset = ASRSliceDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config),
    )

# build model
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
conformer._build(speech_featurizer.shape)
# Weights are loaded only after the model has been built.
conformer.load_weights(args.saved)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

# Run the evaluation over the test dataset with the configured tester.
conformer_tester = BaseTester(
    config=config.learning_config.running_config,
    output_name=args.output_name,
)
conformer_tester.compile(conformer)
conformer_tester.run(test_dataset)
class ConformerTamilASR(object):
    """
    Conformer S based ASR model

    Wraps model construction, weight loading, audio reading and
    transcription behind a small inference API.
    """

    def __init__(self, path='ConformerS.h5'):
        """Build the model and load its weights from ``path``.

        Args:
            path: Location of the weights file. When nothing exists at this
                path, the file is first downloaded from Google Drive using
                the ``file_id`` found in the config.
        """
        # fetch and load the config of the model
        config = Config('tamil_tech/configs/conformer_new_config.yml', learning=True)

        # load speech and text featurizers
        speech_featurizer = TFSpeechFeaturizer(config.speech_config)
        text_featurizer = CharFeaturizer(config.decoder_config)

        # download the model only when it is not already present at `path`
        if not os.path.exists(path):
            print("Downloading Model...")
            download_file_from_google_drive(config.file_id, path)
            print("Downloaded Model Successfully...")

        # load model using config
        self.model = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)

        # set shape of the featurizer and build the model
        self.model._build(speech_featurizer.shape)

        # load weights of the model
        self.model.load_weights(path, by_name=True)

        # display model summary
        self.model.summary(line_length=120)

        # set featurizers for the model
        self.model.add_featurizers(speech_featurizer, text_featurizer)

        print("Loaded Model...!")

    def read_raw_audio(self, audio, sample_rate=16000):
        """Return the audio samples for a path, an encoded byte string or an array.

        Args:
            audio: A file path (``str``), the bytes of an audio file, or a
                ``numpy.ndarray``. An array is returned unchanged.
            sample_rate: Rate the samples are loaded at / resampled to.

        Returns:
            The audio samples as a numpy array.

        Raises:
            ValueError: If ``audio`` is none of the supported types.
        """
        # if audio path is given, load audio using librosa
        if isinstance(audio, str):
            wave, _ = librosa.load(os.path.expanduser(audio), sr=sample_rate)
        # if audio file is in bytes, use soundfile to read audio
        elif isinstance(audio, bytes):
            wave, sr = sf.read(io.BytesIO(audio))

            # if audio is stereo (or has more channels), keep the first
            # channel only; 1-D input has no channel axis and is left as is
            if wave.ndim > 1 and wave.shape[1] >= 2:
                wave = wave[:, 0]

            # get loaded audio as a Fortran-contiguous numpy array
            wave = np.asfortranarray(wave)

            # resample to the requested rate; keyword arguments are used
            # because recent librosa releases reject positional rates
            if sr != sample_rate:
                wave = librosa.resample(wave, orig_sr=sr, target_sr=sample_rate)
        # if numpy array, return audio
        elif isinstance(audio, np.ndarray):
            return audio
        else:
            raise ValueError("input audio must be either a path or bytes")

        return wave

    def bytes_to_string(self, array: np.ndarray, encoding: str = "utf-8"):
        """Decode every byte string in ``array`` with ``encoding``.

        Returns:
            A list with one decoded ``str`` per element of ``array``.
        """
        return [transcript.decode(encoding) for transcript in array]

    def infer(self, path, greedy=True, return_text=False):
        """Transcribe one audio input.

        Args:
            path: Anything accepted by ``read_raw_audio``.
            greedy: Use ``recognize`` when true, otherwise
                ``recognize_beam`` with ``lm=True``.
            return_text: When true the transcription is returned; otherwise
                it is printed (followed by a space) and ``None`` is returned.
        """
        # read the audio
        signal = self.read_raw_audio(path)

        # expand dims to process for a single prediction
        signal = tf.expand_dims(self.model.speech_featurizer.tf_extract(signal), axis=0)

        if greedy:
            # predict greedy
            pred = self.model.recognize(features=signal)
        else:
            # predict using beam search and language model
            pred = self.model.recognize_beam(features=signal, lm=True)

        # only the first (and only) item of the batch is of interest
        text = self.bytes_to_string(pred.numpy())[0]

        if return_text:
            # return predicted transcription
            return text

        # print predicted transcription
        print(text, end=' ')
strategy=strategy) with conformer_trainer.strategy.scope(): # build model if args.pretrained_model is None: print("Training from scratch...") conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) conformer._build(speech_featurizer.shape) conformer.summary(line_length=120) else: print("Training from provided checkpoint...") conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) conformer._build(speech_featurizer.shape) conformer.load_weights(args.pretrained_model) conformer.summary(line_length=120) conformer.add_featurizers(speech_featurizer, text_featurizer) # TODO: Do we need this? optimizer = tf.keras.optimizers.Adam( TransformerSchedule(d_model=conformer.dmodel, warmup_steps=config.learning_config. optimizer_config["warmup_steps"], max_lr=(0.05 / math.sqrt(conformer.dmodel))), beta_1=config.learning_config.optimizer_config["beta1"], beta_2=config.learning_config.optimizer_config["beta2"], epsilon=config.learning_config.optimizer_config["epsilon"]) conformer_trainer.compile(model=conformer, optimizer=optimizer,