def classifytext(self, queries: List[str], batch_size: int = 1, prompt: str = 'Sentiment') -> List[int]:
    """
    Get prediction for the queries

    Args:
        queries: text sequences
        batch_size: batch size to use during inference
        prompt: the prompt string appended at the end of your input sentence

    Returns:
        all_preds: model predictions
    """
    # store predictions for all queries in a single list
    all_preds = []
    mode = self.training
    try:
        # Switch model to evaluation mode
        self.eval()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        dataloader_cfg = {"batch_size": batch_size, "num_workers": 3, "pin_memory": False}
        infer_datalayer = self._setup_infer_dataloader(dataloader_cfg, queries, prompt)

        for i, batch in enumerate(infer_datalayer):
            sentences, _ = batch
            preds = self.forward_eval(sentences)
            all_preds.extend([self.id_to_label[i.item()] for i in preds])
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        logging.set_verbosity(logging_level)
    return all_preds
def transcribe(self, paths2audio_files: List[str], batch_size: int = 4) -> List[str]:
    """
    Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

    Args:
        paths2audio_files: (a list) of paths to audio files. \
            Recommended length per file is between 5 and 25 seconds. \
            But it is possible to pass a few hours long file if enough GPU memory is available.
        batch_size: (int) batch size to use during inference. \
            Bigger will result in better throughput performance but would use more memory.

    Returns:
        A list of transcriptions in the same order as paths2audio_files
    """
    if paths2audio_files is None or len(paths2audio_files) == 0:
        return []

    # We will store transcriptions here
    hypotheses = []
    # Model's mode and device
    mode = self.training
    device = next(self.parameters()).device
    try:
        # Switch model to evaluation mode
        self.eval()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        # Work in tmp directory - will store manifest file there
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
                for audio_file in paths2audio_files:
                    entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'}
                    fp.write(json.dumps(entry) + '\n')

            config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir}

            temporary_datalayer = self._setup_transcribe_dataloader(config)
            for test_batch in temporary_datalayer:
                encoded, encoded_len = self.forward(
                    input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
                )
                hypotheses += self.decoding.rnnt_decoder_predictions_tensor(encoded, encoded_len)
                del test_batch
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        logging.set_verbosity(logging_level)
    return hypotheses
def test_SpectrogramAugmentationr_numba_kernel(self, caplog):
    numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__)

    logging._logger.propagate = True
    original_verbosity = logging.get_verbosity()
    logging.set_verbosity(logging.DEBUG)
    caplog.set_level(logging.DEBUG)

    # Make sure constructor works
    instance1 = modules.SpectrogramAugmentation(
        freq_masks=10, time_masks=3, rect_masks=3, use_numba_spec_augment=True
    )
    assert isinstance(instance1, modules.SpectrogramAugmentation)

    # Make sure forward doesn't throw with expected input
    instance0 = modules.AudioToMelSpectrogramPreprocessor(dither=0)
    input_signal = torch.randn(size=(8, 512))
    length = torch.randint(low=161, high=500, size=[8])

    res0 = instance0(input_signal=input_signal, length=length)
    res = instance1(input_spec=res0[0], length=length)

    assert res.shape == res0[0].shape

    # check that the numba kernel debug message indicates that it is available for use
    assert """Numba SpecAugment kernel is available""" in caplog.text

    logging._logger.propagate = False
    logging.set_verbosity(original_verbosity)
def ptune_inference(self, queries: List[Dict], batch_size: int = 1, decode_token_len: int = 5) -> List[str]:
    """
    Get prediction for the queries

    Args:
        queries: List of data samples without labels
        batch_size: batch size to use during inference
        decode_token_len: max number of tokens to generate during inference

    Returns:
        all_preds: model predictions
    """
    # store predictions for all queries in a single list
    all_preds = []
    mode = self.training
    try:
        # Switch model to evaluation mode
        self.eval()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        dataloader_cfg = {"batch_size": batch_size, "num_workers": 3, "pin_memory": False}
        infer_datalayer = self._setup_infer_dataloader(dataloader_cfg, queries, decode_token_len)

        for i, batch in enumerate(infer_datalayer):
            enc_query = batch['enc_query'].to(self.device)
            label_position = batch['label_position'].to(self.device)
            enc_taskname = batch['enc_taskname'].to(self.device)

            # loss, tokens_enc, labels, enc_mask, encoder_input = self.get_loss(batch)
            predicted_token_ids, _ = self.decode(
                enc_query=enc_query,
                enc_taskname=enc_taskname,
                label_position=label_position,
                num_tokens_to_generate=self.num_tokens_to_gen,
            )
            preds = predicted_token_ids.cpu().numpy().tolist()
            label_positions = label_position.cpu().numpy().tolist()

            for i, (pred, label_position) in enumerate(zip(preds, label_positions)):
                start_position = label_position[0] + 1
                pred = pred[start_position:]
                if self.tokenizer.eos_id in pred:
                    idx = pred.index(self.tokenizer.eos_id)
                    pred = pred[:idx]
                pred = [id for id in pred if id not in self.special_tokens]
                pred = self.tokenizer.ids_to_text(pred)
                all_preds.append(pred)
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        logging.set_verbosity(logging_level)
    return all_preds
def classifytext(self, queries: List[str], batch_size: int = 1, max_seq_length: int = -1) -> List[int]:
    """
    Get prediction for the queries

    Args:
        queries: text sequences
        batch_size: batch size to use during inference
        max_seq_length: sequences longer than max_seq_length will get truncated. default -1 disables truncation.

    Returns:
        all_preds: model predictions
    """
    # store predictions for all queries in a single list
    all_preds = []
    mode = self.training
    device = next(self.parameters()).device
    try:
        # Switch model to evaluation mode
        self.eval()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        dataloader_cfg = {"batch_size": batch_size, "num_workers": 3, "pin_memory": False}
        infer_datalayer = self._setup_infer_dataloader(dataloader_cfg, queries, max_seq_length)

        for i, batch in enumerate(infer_datalayer):
            input_ids, input_type_ids, input_mask, subtokens_mask = batch
            logits = self.forward(
                input_ids=input_ids.to(device),
                token_type_ids=input_type_ids.to(device),
                attention_mask=input_mask.to(device),
            )
            preds = tensor2list(torch.argmax(logits, axis=-1))
            all_preds.extend(preds)
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        logging.set_verbosity(logging_level)
    return all_preds
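
# A minimal usage sketch for the classifytext API above, assuming a NeMo
# TextClassificationModel checkpoint at the hypothetical path "text_classification.nemo";
# the path, queries, and label meanings are illustrative assumptions, not part of the
# original source.
def _example_classifytext_usage():
    from nemo.collections.nlp.models import TextClassificationModel

    # Hypothetical checkpoint path -- replace with a real .nemo file.
    model = TextClassificationModel.restore_from("text_classification.nemo")
    queries = ["the service was great", "the food arrived cold"]
    # classifytext returns one integer class index per query, in the same order as `queries`.
    preds = model.classifytext(queries=queries, batch_size=2, max_seq_length=128)
    print(preds)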
def test_base_model_no_support_for_adapters(self, caplog):
    logging._logger.propagate = True
    original_verbosity = logging.get_verbosity()
    logging.set_verbosity(logging.WARNING)
    caplog.set_level(logging.WARNING)

    cfg = get_model_config(in_features=50, update_adapter_cfg=False)
    model = DefaultAdapterModel(cfg)

    with pytest.raises(AttributeError):
        model.add_adapter(name='adapter_0', cfg=get_adapter_cfg())

    # check that the warning message indicates that the module is not available
    assert """Encoder does not support adapters !""" in caplog.text
    caplog.clear()

    model.get_enabled_adapters()

    # check that there is no warning message, since it should log only once
    assert """Encoder does not support adapters !""" not in caplog.text

    logging._logger.propagate = False
    logging.set_verbosity(original_verbosity)
def transcribe(self, paths2audio_files: List[str], batch_size: int = 4, logprobs=False) -> List[str]:
    """
    Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

    Args:
        paths2audio_files: (a list) of paths to audio files. \
            Recommended length per file is between 5 and 25 seconds. \
            But it is possible to pass a few hours long file if enough GPU memory is available.
        batch_size: (int) batch size to use during inference. \
            Bigger will result in better throughput performance but would use more memory.
        logprobs: (bool) pass True to get log probabilities instead of transcripts.

    Returns:
        A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
    """
    if paths2audio_files is None or len(paths2audio_files) == 0:
        return []

    # We will store transcriptions here
    hypotheses = []
    # Model's mode and device
    mode = self.training
    device = next(self.parameters()).device
    dither_value = self.preprocessor.featurizer.dither
    pad_to_value = self.preprocessor.featurizer.pad_to

    try:
        self.preprocessor.featurizer.dither = 0.0
        self.preprocessor.featurizer.pad_to = 0
        # Switch model to evaluation mode
        self.eval()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        # Work in tmp directory - will store manifest file there
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
                for audio_file in paths2audio_files:
                    entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'}
                    fp.write(json.dumps(entry) + '\n')

            config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir}

            temporary_datalayer = self._setup_transcribe_dataloader(config)
            for test_batch in temporary_datalayer:
                logits, logits_len, greedy_predictions = self.forward(
                    input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
                )
                if logprobs:
                    # dump log probs per file
                    for idx in range(logits.shape[0]):
                        hypotheses.append(logits[idx][:logits_len[idx]])
                else:
                    hypotheses += self._wer.ctc_decoder_predictions_tensor(
                        greedy_predictions, predictions_len=logits_len
                    )
                del test_batch
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        self.preprocessor.featurizer.dither = dither_value
        self.preprocessor.featurizer.pad_to = pad_to_value
        logging.set_verbosity(logging_level)
    return hypotheses
def transcribe(self, paths2audio_files: List[str], batch_size: int = 4, logprobs=False) -> List[str]:
    """
    Generate class labels for provided audio files. Use this method for debugging and prototyping.

    Args:
        paths2audio_files: (a list) of paths to audio files. \
            Recommended length per file is approximately 1 second.
        batch_size: (int) batch size to use during inference. \
            Bigger will result in better throughput performance but would use more memory.
        logprobs: (bool) pass True to get log probabilities instead of class labels.

    Returns:
        A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
    """
    if paths2audio_files is None or len(paths2audio_files) == 0:
        return []

    # We will store transcriptions here
    labels = []
    # Model's mode and device
    mode = self.training
    device = next(self.parameters()).device
    dither_value = self.preprocessor.featurizer.dither
    pad_to_value = self.preprocessor.featurizer.pad_to

    try:
        self.preprocessor.featurizer.dither = 0.0
        self.preprocessor.featurizer.pad_to = 0
        # Switch model to evaluation mode
        self.eval()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        # Work in tmp directory - will store manifest file there
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
                for audio_file in paths2audio_files:
                    label = 0.0 if self.is_regression_task else self.cfg.labels[0]
                    entry = {'audio_filepath': audio_file, 'duration': 100000.0, 'label': label}
                    fp.write(json.dumps(entry) + '\n')

            config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir}

            temporary_datalayer = self._setup_transcribe_dataloader(config)
            for test_batch in temporary_datalayer:
                logits = self.forward(
                    input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
                )
                if logprobs:
                    # dump log probs per file
                    for idx in range(logits.shape[0]):
                        lg = logits[idx]
                        labels.append(lg.cpu().numpy())
                else:
                    labels_k = []
                    top_ks = self._accuracy.top_k
                    for top_k_i in top_ks:
                        # replace top k value with current top k
                        self._accuracy.top_k = top_k_i
                        labels_k_i = self._accuracy.top_k_predicted_labels(logits)
                        labels_k.append(labels_k_i)

                    # convenience: if only one top_k, pop out the nested list
                    if len(top_ks) == 1:
                        labels_k = labels_k[0]

                    labels += labels_k
                    # reset top k to original value
                    self._accuracy.top_k = top_ks
                del test_batch
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        self.preprocessor.featurizer.dither = dither_value
        self.preprocessor.featurizer.pad_to = pad_to_value
        logging.set_verbosity(logging_level)

    return labels
def inference(
    self,
    file: str,
    batch_size: int = 1,
    num_samples: int = -1,
    output_nbest_file: Optional[str] = None,
    output_prediction_file: Optional[str] = None,
):
    """
    Get prediction for unlabeled inference data

    Args:
        file: inference data
        batch_size: batch size to use during inference
        num_samples: number of samples to use of inference data. Default: -1 if all data should be used.
        output_nbest_file: optional output file for writing out nbest list
        output_prediction_file: optional output file for writing out predictions

    Returns:
        model predictions, model nbest list
    """
    # store predictions for all queries in a single list
    all_predictions = []
    all_nbest = []
    mode = self.training
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        # Switch model to evaluation mode
        self.eval()
        self.to(device)
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        dataloader_cfg = {
            "batch_size": batch_size,
            "file": file,
            "shuffle": False,
            "num_samples": num_samples,
            'num_workers': 2,
            'pin_memory': False,
            'drop_last': False,
        }
        dataloader_cfg = OmegaConf.create(dataloader_cfg)
        infer_datalayer = self._setup_dataloader_from_config(cfg=dataloader_cfg, mode=INFERENCE_MODE)

        all_logits = []
        all_unique_ids = []
        for i, batch in enumerate(infer_datalayer):
            input_ids, token_type_ids, attention_mask, unique_ids = batch
            logits = self.forward(
                input_ids=input_ids.to(device),
                token_type_ids=token_type_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            all_logits.append(logits)
            all_unique_ids.append(unique_ids)
        logits = torch.cat(all_logits)
        unique_ids = tensor2list(torch.cat(all_unique_ids))
        s, e = logits.split(dim=-1, split_size=1)
        start_logits = tensor2list(s.squeeze(-1))
        end_logits = tensor2list(e.squeeze(-1))
        (all_predictions, all_nbest, scores_diff) = infer_datalayer.dataset.get_predictions(
            unique_ids=unique_ids,
            start_logits=start_logits,
            end_logits=end_logits,
            n_best_size=self._cfg.dataset.n_best_size,
            max_answer_length=self._cfg.dataset.max_answer_length,
            version_2_with_negative=self._cfg.dataset.version_2_with_negative,
            null_score_diff_threshold=self._cfg.dataset.null_score_diff_threshold,
            do_lower_case=self._cfg.dataset.do_lower_case,
        )

        with open(file, 'r') as test_file_fp:
            test_data = json.load(test_file_fp)["data"]
            id_to_question_mapping = {}
            for title in test_data:
                for par in title["paragraphs"]:
                    for question in par["qas"]:
                        id_to_question_mapping[question["id"]] = question["question"]

        for question_id in all_predictions:
            all_predictions[question_id] = (id_to_question_mapping[question_id], all_predictions[question_id])

        if output_nbest_file is not None:
            with open(output_nbest_file, "w") as writer:
                writer.write(json.dumps(all_nbest, indent=4) + "\n")
        if output_prediction_file is not None:
            with open(output_prediction_file, "w") as writer:
                writer.write(json.dumps(all_predictions, indent=4) + "\n")
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        logging.set_verbosity(logging_level)

    return all_predictions, all_nbest
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5Base-En",
        choices=[x.pretrained_model_name for x in EncDecCTCModel.list_available_models()],
    )
    parser.add_argument(
        "--tts_model_spec",
        type=str,
        default="Tacotron2-22050Hz",
        choices=[x.pretrained_model_name for x in SpectrogramGenerator.list_available_models()],
    )
    parser.add_argument(
        "--tts_model_vocoder",
        type=str,
        default="WaveGlow-22050Hz",
        choices=[x.pretrained_model_name for x in Vocoder.list_available_models()],
    )
    parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test")
    parser.add_argument("--trim", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.debug:
        logging.set_verbosity(logging.DEBUG)

    logging.info(f"Using NGC cloud ASR model {args.asr_model}")
    asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)
    logging.info(f"Using NGC cloud TTS Spectrogram Generator model {args.tts_model_spec}")
    tts_model_spec = SpectrogramGenerator.from_pretrained(model_name=args.tts_model_spec)
    logging.info(f"Using NGC cloud TTS Vocoder model {args.tts_model_vocoder}")
    tts_model_vocoder = Vocoder.from_pretrained(model_name=args.tts_model_vocoder)
    models = [asr_model, tts_model_spec, tts_model_vocoder]

    if torch.cuda.is_available():
        for i, m in enumerate(models):
            models[i] = m.cuda()
    for m in models:
        m.eval()

    asr_model, tts_model_spec, tts_model_vocoder = models

    parser = parsers.make_parser(
        labels=asr_model.decoder.vocabulary, name="en", unk_id=-1, blank_id=-1, do_normalize=True,
    )
    labels_map = dict([(i, asr_model.decoder.vocabulary[i]) for i in range(len(asr_model.decoder.vocabulary))])

    tts_input = []
    asr_references = []
    longest_tts_input = 0
    for test_str in LIST_OF_TEST_STRINGS:
        tts_parsed_input = tts_model_spec.parse(test_str)
        if len(tts_parsed_input[0]) > longest_tts_input:
            longest_tts_input = len(tts_parsed_input[0])
        tts_input.append(tts_parsed_input.squeeze())

        asr_parsed = parser(test_str)
        asr_parsed = ''.join([labels_map[c] for c in asr_parsed])
        asr_references.append(asr_parsed)

    # Pad TTS Inputs
    for i, text in enumerate(tts_input):
        pad = (0, longest_tts_input - len(text))
        tts_input[i] = torch.nn.functional.pad(text, pad, value=68)

    logging.debug(tts_input)

    # Do TTS
    tts_input = torch.stack(tts_input)
    if torch.cuda.is_available():
        tts_input = tts_input.cuda()

    specs = tts_model_spec.generate_spectrogram(tokens=tts_input)
    audio = []
    step = ceil(len(specs) / 4)
    for i in range(4):
        audio.append(tts_model_vocoder.convert_spectrogram_to_audio(spec=specs[i * step:i * step + step]))

    audio = [item for sublist in audio for item in sublist]

    audio_file_paths = []
    # Save audio
    logging.debug(f"args.trim: {args.trim}")
    for i, aud in enumerate(audio):
        aud = aud.cpu().numpy()
        if args.trim:
            aud = librosa.effects.trim(aud, top_db=40)[0]
        librosa.output.write_wav(f"{i}.wav", aud, sr=22050)
        audio_file_paths.append(str(Path(f"{i}.wav")))

    # Do ASR
    hypotheses = asr_model.transcribe(audio_file_paths)
    for i, _ in enumerate(hypotheses):
        logging.debug(f"{i}")
        logging.debug(f"ref:'{asr_references[i]}'")
        logging.debug(f"hyp:'{hypotheses[i]}'")
    wer_value = word_error_rate(hypotheses=hypotheses, references=asr_references)
    if wer_value > args.wer_tolerance:
        raise ValueError(f"Got WER of {wer_value}. It was higher than {args.wer_tolerance}")
    logging.info(f'Got WER of {wer_value}. Tolerance was {args.wer_tolerance}')
def transcribe(
    self,
    paths2audio_files: List[str],
    batch_size: int = 4,
    logprobs: bool = False,
    return_hypotheses: bool = False,
    num_workers: int = 0,
) -> List[str]:
    """
    Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

    Args:
        paths2audio_files: (a list) of paths to audio files. \
            Recommended length per file is between 5 and 25 seconds. \
            But it is possible to pass a few hours long file if enough GPU memory is available.
        batch_size: (int) batch size to use during inference.
            Bigger will result in better throughput performance but would use more memory.
        logprobs: (bool) pass True to get log probabilities instead of transcripts.
        return_hypotheses: (bool) Either return hypotheses or text.
            With hypotheses you can do some postprocessing like getting timestamps or rescoring.
        num_workers: (int) number of workers for DataLoader

    Returns:
        A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
    """
    if paths2audio_files is None or len(paths2audio_files) == 0:
        return []

    if return_hypotheses and logprobs:
        raise ValueError(
            "Either `return_hypotheses` or `logprobs` can be True at any given time."
            "Returned hypotheses will contain the logprobs."
        )

    if num_workers is None:
        num_workers = min(batch_size, os.cpu_count() - 1)

    # We will store transcriptions here
    hypotheses = []
    # Model's mode and device
    mode = self.training
    device = next(self.parameters()).device
    dither_value = self.preprocessor.featurizer.dither
    pad_to_value = self.preprocessor.featurizer.pad_to

    try:
        self.preprocessor.featurizer.dither = 0.0
        self.preprocessor.featurizer.pad_to = 0
        # Switch model to evaluation mode
        self.eval()
        # Freeze the encoder and decoder modules
        self.encoder.freeze()
        self.decoder.freeze()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        # Work in tmp directory - will store manifest file there
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp:
                for audio_file in paths2audio_files:
                    entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''}
                    fp.write(json.dumps(entry) + '\n')

            config = {
                'paths2audio_files': paths2audio_files,
                'batch_size': batch_size,
                'temp_dir': tmpdir,
                'num_workers': num_workers,
            }

            temporary_datalayer = self._setup_transcribe_dataloader(config)
            for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
                logits, logits_len, greedy_predictions = self.forward(
                    input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
                )
                if logprobs:
                    # dump log probs per file
                    for idx in range(logits.shape[0]):
                        lg = logits[idx][: logits_len[idx]]
                        hypotheses.append(lg.cpu().numpy())
                else:
                    current_hypotheses = self._wer.ctc_decoder_predictions_tensor(
                        greedy_predictions, predictions_len=logits_len, return_hypotheses=return_hypotheses,
                    )

                    if return_hypotheses:
                        # dump log probs per file
                        for idx in range(logits.shape[0]):
                            current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]]

                    hypotheses += current_hypotheses

                del greedy_predictions
                del logits
                del test_batch
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        self.preprocessor.featurizer.dither = dither_value
        self.preprocessor.featurizer.pad_to = pad_to_value
        if mode is True:
            self.encoder.unfreeze()
            self.decoder.unfreeze()
        logging.set_verbosity(logging_level)
    return hypotheses
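
# A minimal usage sketch for the CTC transcribe API above, assuming a pretrained
# EncDecCTCModel from NGC ("QuartzNet15x5Base-En", as used in the script below) and
# two hypothetical wav paths; replace the paths with real audio files.
def _example_ctc_transcribe_usage():
    from nemo.collections.asr.models import EncDecCTCModel

    model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
    # Hypothetical audio paths -- each should be a mono wav file at the model's sample rate.
    transcripts = model.transcribe(paths2audio_files=["sample_0.wav", "sample_1.wav"], batch_size=2)
    print(transcripts)  # one text transcription per input file, in order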
def transcribe(
    self,
    paths2audio_files: List[str],
    batch_size: int = 4,
    return_hypotheses: bool = False,
    partial_hypothesis: Optional[List['Hypothesis']] = None,
    num_workers: int = 0,
) -> (List[str], Optional[List['Hypothesis']]):
    """
    Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

    Args:
        paths2audio_files: (a list) of paths to audio files. \
            Recommended length per file is between 5 and 25 seconds. \
            But it is possible to pass a few hours long file if enough GPU memory is available.
        batch_size: (int) batch size to use during inference. \
            Bigger will result in better throughput performance but would use more memory.
        return_hypotheses: (bool) Either return hypotheses or text.
            With hypotheses you can do some postprocessing like getting timestamps or rescoring.
        num_workers: (int) number of workers for DataLoader

    Returns:
        A list of transcriptions in the same order as paths2audio_files. Will also return
        a second list containing all hypotheses per sample (n-best when available,
        otherwise the best hypothesis again).
    """
    if paths2audio_files is None or len(paths2audio_files) == 0:
        return [], []

    # We will store transcriptions here
    hypotheses = []
    all_hypotheses = []
    # Model's mode and device
    mode = self.training
    device = next(self.parameters()).device
    dither_value = self.preprocessor.featurizer.dither
    pad_to_value = self.preprocessor.featurizer.pad_to

    if num_workers is None:
        num_workers = min(batch_size, os.cpu_count() - 1)

    try:
        self.preprocessor.featurizer.dither = 0.0
        self.preprocessor.featurizer.pad_to = 0

        # Switch model to evaluation mode
        self.eval()
        # Freeze the encoder and decoder modules
        self.encoder.freeze()
        self.decoder.freeze()
        self.joint.freeze()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        # Work in tmp directory - will store manifest file there
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp:
                for audio_file in paths2audio_files:
                    entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''}
                    fp.write(json.dumps(entry) + '\n')

            config = {
                'paths2audio_files': paths2audio_files,
                'batch_size': batch_size,
                'temp_dir': tmpdir,
                'num_workers': num_workers,
            }

            temporary_datalayer = self._setup_transcribe_dataloader(config)
            for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
                encoded, encoded_len = self.forward(
                    input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
                )
                best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor(
                    encoded,
                    encoded_len,
                    return_hypotheses=return_hypotheses,
                    partial_hypotheses=partial_hypothesis,
                )

                hypotheses += best_hyp
                if all_hyp is not None:
                    all_hypotheses += all_hyp
                else:
                    all_hypotheses += best_hyp

                del encoded
                del test_batch
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        self.preprocessor.featurizer.dither = dither_value
        self.preprocessor.featurizer.pad_to = pad_to_value

        logging.set_verbosity(logging_level)
        if mode is True:
            self.encoder.unfreeze()
            self.decoder.unfreeze()
            self.joint.unfreeze()
    return hypotheses, all_hypotheses
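
# A minimal usage sketch for the RNNT transcribe API above. The pretrained model name
# and the wav path are illustrative assumptions; any EncDecRNNTModel checkpoint would do.
def _example_rnnt_transcribe_usage():
    from nemo.collections.asr.models import EncDecRNNTModel

    # Assumed pretrained transducer model name -- substitute any available RNNT checkpoint.
    model = EncDecRNNTModel.from_pretrained(model_name="stt_en_conformer_transducer_large")
    # Unlike the CTC variant, this transcribe returns a tuple: best hypotheses and all hypotheses.
    best, all_hyps = model.transcribe(paths2audio_files=["sample_0.wav"], batch_size=1)
    print(best[0])        # best transcription for the first file
    print(len(all_hyps))  # one entry per input file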
def ptune_inference(self, queries: List[Dict], batch_size: int = 1, decode_token_len: int = None) -> List[str]:
    """
    Get prediction for the queries

    Args:
        queries: List of data samples without labels
        batch_size: batch size to use during inference
        decode_token_len: max number of tokens to generate during inference

    Returns:
        all_preds: model predictions
    """
    if decode_token_len is None:
        decode_token_len = self.decoder_seq_length
    # store predictions for all queries in a single list
    all_preds = []
    mode = self.training
    try:
        # Switch model to evaluation mode
        self.eval()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        dataloader_cfg = {"batch_size": batch_size, "num_workers": 3, "pin_memory": False}
        infer_datalayer = self._setup_infer_dataloader(dataloader_cfg, queries, decode_token_len)

        for i, batch in enumerate(infer_datalayer):
            tokens_enc = batch['text_enc'].to(self.device)
            enc_taskname = batch['enc_taskname'].to(self.device)
            enc_mask = batch['enc_mask'].to(self.device)

            input_embeds = self.embed_input(tokens_enc, enc_taskname)
            encoder_position_ids = build_position_ids(tokens_enc)
            position_embeddings = self.position_embeddings(encoder_position_ids)
            encoder_input = input_embeds + position_embeddings

            # loss, tokens_enc, labels, enc_mask, encoder_input = self.get_loss(batch)
            if self.float_type == torch.float32:
                predicted_token_ids, _ = self.model.decode(
                    tokens_enc=tokens_enc,
                    enc_mask=enc_mask,
                    num_tokens_to_generate=decode_token_len,
                    enc_input=encoder_input,
                )
            else:
                with torch.autocast(device_type="cuda", dtype=self.float_type):
                    predicted_token_ids, _ = self.model.decode(
                        tokens_enc=tokens_enc,
                        enc_mask=enc_mask,
                        num_tokens_to_generate=decode_token_len,
                        enc_input=encoder_input,
                    )

            preds = predicted_token_ids.cpu().numpy().tolist()

            for i, pred in enumerate(preds):
                if self.tokenizer.eos_id in pred:
                    idx = pred.index(self.tokenizer.eos_id)
                    pred = pred[:idx]
                pred = [id for id in pred if id not in self.tokenizer.special_token_to_id.values()]
                pred = self.tokenizer.ids_to_text(pred)
                all_preds.append(pred)
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        logging.set_verbosity(logging_level)
    return all_preds
def transcribe_partial_audio(
    asr_model,
    path2manifest: str,
    batch_size: int = 4,
    logprobs: bool = False,
    return_hypotheses: bool = False,
    num_workers: int = 0,
) -> List[str]:
    assert isinstance(asr_model, EncDecCTCModel), "Currently supports CTC models only."

    if return_hypotheses and logprobs:
        raise ValueError(
            "Either `return_hypotheses` or `logprobs` can be True at any given time."
            "Returned hypotheses will contain the logprobs."
        )

    if num_workers is None:
        num_workers = min(batch_size, os.cpu_count() - 1)

    # We will store transcriptions here
    hypotheses = []
    # Model's mode and device
    mode = asr_model.training
    device = next(asr_model.parameters()).device
    dither_value = asr_model.preprocessor.featurizer.dither
    pad_to_value = asr_model.preprocessor.featurizer.pad_to

    try:
        asr_model.preprocessor.featurizer.dither = 0.0
        asr_model.preprocessor.featurizer.pad_to = 0
        # Switch model to evaluation mode
        asr_model.eval()
        # Freeze the encoder and decoder modules
        asr_model.encoder.freeze()
        asr_model.decoder.freeze()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)

        config = {
            'manifest_filepath': path2manifest,
            'batch_size': batch_size,
            'num_workers': num_workers,
        }

        temporary_datalayer = asr_model._setup_transcribe_dataloader(config)
        for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
            logits, logits_len, greedy_predictions = asr_model.forward(
                input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
            )
            if logprobs:
                # dump log probs per file
                for idx in range(logits.shape[0]):
                    lg = logits[idx][:logits_len[idx]]
                    hypotheses.append(lg.cpu().numpy())
            else:
                current_hypotheses = asr_model._wer.ctc_decoder_predictions_tensor(
                    greedy_predictions, predictions_len=logits_len, return_hypotheses=return_hypotheses,
                )

                if return_hypotheses:
                    # dump log probs per file
                    for idx in range(logits.shape[0]):
                        current_hypotheses[idx].y_sequence = logits[idx][:logits_len[idx]]

                hypotheses += current_hypotheses

            del greedy_predictions
            del logits
            del test_batch
    finally:
        # set mode back to its original value
        asr_model.train(mode=mode)
        asr_model.preprocessor.featurizer.dither = dither_value
        asr_model.preprocessor.featurizer.pad_to = pad_to_value
        if mode is True:
            asr_model.encoder.unfreeze()
            asr_model.decoder.unfreeze()
        logging.set_verbosity(logging_level)

    return hypotheses
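
# A minimal usage sketch for transcribe_partial_audio above, assuming a pretrained CTC
# model and a hypothetical manifest file "partial_manifest.json". The manifest path and
# its field values are assumptions for illustration; each line is a JSON entry such as
# {"audio_filepath": "sample_0.wav", "offset": 0.0, "duration": 5.0, "text": ""}.
def _example_transcribe_partial_audio_usage():
    from nemo.collections.asr.models import EncDecCTCModel

    asr_model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
    hyps = transcribe_partial_audio(
        asr_model=asr_model,
        path2manifest="partial_manifest.json",  # assumed manifest path
        batch_size=4,
        num_workers=2,
    )
    print(hyps)  # one transcription per manifest entry, in manifest order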