def classifytext(self, queries: List[str], batch_size: int = 1, prompt: str = 'Sentiment') -> List[int]:
     """
     Get prediction for the queries
     Args:
         queries: text sequences
         batch_size: batch size to use during inference
         prompt: the prompt string appended at the end of your input sentence
     Returns:
         all_preds: model predictions
     """
     # store predictions for all queries in a single list
     all_preds = []
     mode = self.training
     try:
         # Switch model to evaluation mode
         self.eval()
         logging_level = logging.get_verbosity()
         logging.set_verbosity(logging.WARNING)
         dataloader_cfg = {"batch_size": batch_size, "num_workers": 3, "pin_memory": False}
         infer_datalayer = self._setup_infer_dataloader(dataloader_cfg, queries, prompt)
         for i, batch in enumerate(infer_datalayer):
             sentences, _ = batch
             preds = self.forward_eval(sentences)
             all_preds.extend([self.id_to_label[i.item()] for i in preds])
     finally:
         # set mode back to its original value
         self.train(mode=mode)
         logging.set_verbosity(logging_level)
     return all_preds
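
Note: every inference helper in this listing repeats the same bookkeeping around nemo.utils.logging: remember the model's training mode and the current verbosity, switch to eval mode and WARNING for the duration of inference, and restore both in a finally block. A minimal sketch of that pattern as a reusable context manager (quiet_eval is an illustrative name, not part of NeMo):

from contextlib import contextmanager

import torch
from nemo.utils import logging


@contextmanager
def quiet_eval(model: torch.nn.Module):
    """Temporarily switch `model` to eval mode and raise NeMo logging to WARNING."""
    mode = model.training
    logging_level = logging.get_verbosity()
    try:
        model.eval()
        logging.set_verbosity(logging.WARNING)
        yield model
    finally:
        # restore the original training mode and verbosity
        model.train(mode=mode)
        logging.set_verbosity(logging_level)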
Example #2
File: rnnt_models.py  Project: wgfi110/NeMo
    def transcribe(self,
                   paths2audio_files: List[str],
                   batch_size: int = 4) -> List[str]:
        """
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

        Args:

            paths2audio_files: (a list) of paths to audio files. \
        Recommended length per file is between 5 and 25 seconds. \
        But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference. \
        Bigger will result in better throughput performance but would use more memory.

        Returns:

            A list of transcriptions in the same order as paths2audio_files
        """
        if paths2audio_files is None or len(paths2audio_files) == 0:
            return []
        # We will store transcriptions here
        hypotheses = []
        # Model's mode and device
        mode = self.training
        device = next(self.parameters()).device
        try:
            # Switch model to evaluation mode
            self.eval()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            # Work in tmp directory - will store manifest file there
            with tempfile.TemporaryDirectory() as tmpdir:
                with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
                    for audio_file in paths2audio_files:
                        entry = {
                            'audio_filepath': audio_file,
                            'duration': 100000,
                            'text': 'nothing'
                        }
                        fp.write(json.dumps(entry) + '\n')

                config = {
                    'paths2audio_files': paths2audio_files,
                    'batch_size': batch_size,
                    'temp_dir': tmpdir
                }

                temporary_datalayer = self._setup_transcribe_dataloader(config)
                for test_batch in temporary_datalayer:
                    encoded, encoded_len = self.forward(
                        input_signal=test_batch[0].to(device),
                        input_signal_length=test_batch[1].to(device))
                    hypotheses += self.decoding.rnnt_decoder_predictions_tensor(
                        encoded, encoded_len)
                    del test_batch
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            logging.set_verbosity(logging_level)
        return hypotheses
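
A hedged usage sketch for the transcribe() method above; EncDecRNNTModel.from_pretrained is the standard NeMo loading path, but the checkpoint name and audio paths below are only placeholders:

from nemo.collections.asr.models import EncDecRNNTModel

# placeholder checkpoint name and audio paths
model = EncDecRNNTModel.from_pretrained(model_name="stt_en_rnnt_model")
audio_files = ["sample_1.wav", "sample_2.wav"]

transcripts = model.transcribe(paths2audio_files=audio_files, batch_size=2)
for path, text in zip(audio_files, transcripts):
    print(f"{path}: {text}")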
Example #3
    def test_SpectrogramAugmentationr_numba_kernel(self, caplog):
        numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__)

        logging._logger.propagate = True
        original_verbosity = logging.get_verbosity()
        logging.set_verbosity(logging.DEBUG)
        caplog.set_level(logging.DEBUG)

        # Make sure constructor works
        instance1 = modules.SpectrogramAugmentation(
            freq_masks=10, time_masks=3, rect_masks=3, use_numba_spec_augment=True
        )
        assert isinstance(instance1, modules.SpectrogramAugmentation)

        # Make sure forward doesn't throw with expected input
        instance0 = modules.AudioToMelSpectrogramPreprocessor(dither=0)
        input_signal = torch.randn(size=(8, 512))
        length = torch.randint(low=161, high=500, size=[8])
        res0 = instance0(input_signal=input_signal, length=length)
        res = instance1(input_spec=res0[0], length=length)

        assert res.shape == res0[0].shape

        # check that the numba kernel debug message indicates that it is available for use
        assert """Numba SpecAugment kernel is available""" in caplog.text

        logging._logger.propagate = False
        logging.set_verbosity(original_verbosity)
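
Both tests in this listing rely on the fact that NeMo's logger does not propagate to the root logger by default, so pytest's caplog fixture only sees its records while propagation is switched on. A stripped-down sketch of that pattern, with an illustrative log message:

from nemo.utils import logging


def test_caplog_captures_nemo_logging(caplog):
    # NeMo's logger must propagate to the root logger for caplog to see it
    logging._logger.propagate = True
    original_verbosity = logging.get_verbosity()
    logging.set_verbosity(logging.DEBUG)
    caplog.set_level(logging.DEBUG)

    try:
        logging.debug("illustrative debug message")
        assert "illustrative debug message" in caplog.text
    finally:
        # undo the global changes so other tests are unaffected
        logging._logger.propagate = False
        logging.set_verbosity(original_verbosity)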
Example #4
 def ptune_inference(self,
                     queries: List[Dict],
                     batch_size: int = 1,
                     decode_token_len: int = 5) -> List[str]:
     """
     Get prediction for the queries
     Args:
         queries: List of data samples without labels
         batch_size: batch size to use during inference
         decode_token_len: max number of tokens to generate during inference
     Returns:
         all_preds: model predictions
     """
     # store predictions for all queries in a single list
     all_preds = []
     mode = self.training
     try:
         # Switch model to evaluation mode
         self.eval()
         logging_level = logging.get_verbosity()
         logging.set_verbosity(logging.WARNING)
         dataloader_cfg = {
             "batch_size": batch_size,
             "num_workers": 3,
             "pin_memory": False
         }
         infer_datalayer = self._setup_infer_dataloader(
             dataloader_cfg, queries, decode_token_len)
         for i, batch in enumerate(infer_datalayer):
             enc_query = batch['enc_query'].to(self.device)
             label_position = batch['label_position'].to(self.device)
             enc_taskname = batch['enc_taskname'].to(self.device)
             # loss, tokens_enc, labels, enc_mask, encoder_input = self.get_loss(batch)
             predicted_token_ids, _ = self.decode(
                 enc_query=enc_query,
                 enc_taskname=enc_taskname,
                 label_position=label_position,
                 num_tokens_to_generate=self.num_tokens_to_gen,
             )
             preds = predicted_token_ids.cpu().numpy().tolist()
             label_positions = label_position.cpu().numpy().tolist()
             for i, (pred, label_position) in enumerate(
                     zip(preds, label_positions)):
                 start_position = label_position[0] + 1
                 pred = pred[start_position:]
                 if self.tokenizer.eos_id in pred:
                     idx = pred.index(self.tokenizer.eos_id)
                     pred = pred[:idx]
                 pred = [id for id in pred if id not in self.special_tokens]
                 pred = self.tokenizer.ids_to_text(pred)
                 all_preds.append(pred)
     finally:
         # set mode back to its original value
         self.train(mode=mode)
         logging.set_verbosity(logging_level)
     return all_preds
Example #5
    def classifytext(self,
                     queries: List[str],
                     batch_size: int = 1,
                     max_seq_length: int = -1) -> List[int]:
        """
        Get prediction for the queries
        Args:
            queries: text sequences
            batch_size: batch size to use during inference
            max_seq_length: sequences longer than max_seq_length will get truncated. default -1 disables truncation.
        Returns:
            all_preds: model predictions
        """
        # store predictions for all queries in a single list
        all_preds = []
        mode = self.training
        device = next(self.parameters()).device
        try:
            # Switch model to evaluation mode
            self.eval()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            dataloader_cfg = {
                "batch_size": batch_size,
                "num_workers": 3,
                "pin_memory": False
            }
            infer_datalayer = self._setup_infer_dataloader(
                dataloader_cfg, queries, max_seq_length)

            for i, batch in enumerate(infer_datalayer):
                input_ids, input_type_ids, input_mask, subtokens_mask = batch

                logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                preds = tensor2list(torch.argmax(logits, axis=-1))
                all_preds.extend(preds)
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            logging.set_verbosity(logging_level)
        return all_preds
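
A hedged usage sketch for classifytext(); the class assumed here is NeMo's TextClassificationModel, which exposes this method, and the checkpoint path is a placeholder:

from nemo.collections.nlp.models import TextClassificationModel

# placeholder checkpoint path
model = TextClassificationModel.restore_from("text_classification.nemo")
queries = ["the service was excellent", "the food arrived cold"]

preds = model.classifytext(queries=queries, batch_size=2, max_seq_length=128)
print(preds)  # one predicted class id per query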
Example #6
    def test_base_model_no_support_for_adapters(self, caplog):
        logging._logger.propagate = True
        original_verbosity = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        caplog.set_level(logging.WARNING)

        cfg = get_model_config(in_features=50, update_adapter_cfg=False)
        model = DefaultAdapterModel(cfg)

        with pytest.raises(AttributeError):
            model.add_adapter(name='adapter_0', cfg=get_adapter_cfg())

        # check that the warning message indicates that the module is not available
        assert """Encoder does not support adapters !""" in caplog.text

        caplog.clear()
        model.get_enabled_adapters()

        # check that there is no warning message, since it should log only once.
        assert """Encoder does not support adapters !""" not in caplog.text

        logging._logger.propagate = False
        logging.set_verbosity(original_verbosity)
Example #7
    def transcribe(self,
                   paths2audio_files: List[str],
                   batch_size: int = 4,
                   logprobs=False) -> List[str]:
        """
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

        Args:

            paths2audio_files: (a list) of paths to audio files. \
        Recommended length per file is between 5 and 25 seconds. \
        But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference. \
        Bigger will result in better throughput performance but would use more memory.
            logprobs: (bool) pass True to get log probabilities instead of transcripts.

        Returns:

            A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
        """
        if paths2audio_files is None or len(paths2audio_files) == 0:
            return []
        # We will store transcriptions here
        hypotheses = []
        # Model's mode and device
        mode = self.training
        device = next(self.parameters()).device
        dither_value = self.preprocessor.featurizer.dither
        pad_to_value = self.preprocessor.featurizer.pad_to

        try:
            self.preprocessor.featurizer.dither = 0.0
            self.preprocessor.featurizer.pad_to = 0
            # Switch model to evaluation mode
            self.eval()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            # Work in tmp directory - will store manifest file there
            with tempfile.TemporaryDirectory() as tmpdir:
                with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
                    for audio_file in paths2audio_files:
                        entry = {
                            'audio_filepath': audio_file,
                            'duration': 100000,
                            'text': 'nothing'
                        }
                        fp.write(json.dumps(entry) + '\n')

                config = {
                    'paths2audio_files': paths2audio_files,
                    'batch_size': batch_size,
                    'temp_dir': tmpdir
                }

                temporary_datalayer = self._setup_transcribe_dataloader(config)
                for test_batch in temporary_datalayer:
                    logits, logits_len, greedy_predictions = self.forward(
                        input_signal=test_batch[0].to(device),
                        input_signal_length=test_batch[1].to(device))
                    if logprobs:
                        # dump log probs per file
                        for idx in range(logits.shape[0]):
                            hypotheses.append(logits[idx][:logits_len[idx]])
                    else:
                        hypotheses += self._wer.ctc_decoder_predictions_tensor(
                            greedy_predictions, predictions_len=logits_len)
                    del test_batch
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            self.preprocessor.featurizer.dither = dither_value
            self.preprocessor.featurizer.pad_to = pad_to_value
            logging.set_verbosity(logging_level)
        return hypotheses
Example #8
    def transcribe(self,
                   paths2audio_files: List[str],
                   batch_size: int = 4,
                   logprobs=False) -> List[str]:
        """
        Generate class labels for provided audio files. Use this method for debugging and prototyping.

        Args:
            paths2audio_files: (a list) of paths to audio files. \
                Recommended length per file is approximately 1 second.
            batch_size: (int) batch size to use during inference. \
                Bigger will result in better throughput performance but would use more memory.
            logprobs: (bool) pass True to get log probabilities instead of class labels.

        Returns:

            A list of class labels (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
        """
        if paths2audio_files is None or len(paths2audio_files) == 0:
            return []
        # We will store predicted labels here
        labels = []
        # Model's mode and device
        mode = self.training
        device = next(self.parameters()).device
        dither_value = self.preprocessor.featurizer.dither
        pad_to_value = self.preprocessor.featurizer.pad_to

        try:
            self.preprocessor.featurizer.dither = 0.0
            self.preprocessor.featurizer.pad_to = 0
            # Switch model to evaluation mode
            self.eval()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            # Work in tmp directory - will store manifest file there
            with tempfile.TemporaryDirectory() as tmpdir:
                with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
                    for audio_file in paths2audio_files:
                        label = 0.0 if self.is_regression_task else self.cfg.labels[
                            0]
                        entry = {
                            'audio_filepath': audio_file,
                            'duration': 100000.0,
                            'label': label
                        }
                        fp.write(json.dumps(entry) + '\n')

                config = {
                    'paths2audio_files': paths2audio_files,
                    'batch_size': batch_size,
                    'temp_dir': tmpdir
                }

                temporary_datalayer = self._setup_transcribe_dataloader(config)
                for test_batch in temporary_datalayer:
                    logits = self.forward(
                        input_signal=test_batch[0].to(device),
                        input_signal_length=test_batch[1].to(device))
                    if logprobs:
                        # dump log probs per file
                        for idx in range(logits.shape[0]):
                            lg = logits[idx]
                            labels.append(lg.cpu().numpy())
                    else:
                        labels_k = []
                        top_ks = self._accuracy.top_k
                        for top_k_i in top_ks:
                            # replace top k value with current top k
                            self._accuracy.top_k = top_k_i
                            labels_k_i = self._accuracy.top_k_predicted_labels(
                                logits)
                            labels_k.append(labels_k_i)

                        # convenience: if only one top_k, pop out the nested list
                        if len(top_ks) == 1:
                            labels_k = labels_k[0]

                        labels += labels_k
                        # reset top k to original value
                        self._accuracy.top_k = top_ks
                    del test_batch
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            self.preprocessor.featurizer.dither = dither_value
            self.preprocessor.featurizer.pad_to = pad_to_value
            logging.set_verbosity(logging_level)
        return labels
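
A hedged usage sketch for this classification-style transcribe(); EncDecClassificationModel is assumed to be the owning class, and the checkpoint name and audio path are placeholders:

from nemo.collections.asr.models import EncDecClassificationModel

# placeholder checkpoint name; any audio-classification checkpoint would do
model = EncDecClassificationModel.from_pretrained(model_name="speech_command_model")
files = ["command.wav"]  # placeholder audio path

labels = model.transcribe(paths2audio_files=files, batch_size=1)
print(labels[0])  # predicted class label for the file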
Example #9
    def inference(
        self,
        file: str,
        batch_size: int = 1,
        num_samples: int = -1,
        output_nbest_file: Optional[str] = None,
        output_prediction_file: Optional[str] = None,
    ):
        """
        Get prediction for unlabeled inference data

        Args:
            file: inference data
            batch_size: batch size to use during inference
            num_samples: number of samples of the inference data to use. Default: -1 to use all data.
            output_nbest_file: optional output file for writing out nbest list
            output_prediction_file: optional output file for writing out predictions
            
        Returns:
            model predictions, model nbest list
        """
        # store predictions for all queries in a single list
        all_predictions = []
        all_nbest = []
        mode = self.training
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        try:
            # Switch model to evaluation mode
            self.eval()
            self.to(device)
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            dataloader_cfg = {
                "batch_size": batch_size,
                "file": file,
                "shuffle": False,
                "num_samples": num_samples,
                'num_workers': 2,
                'pin_memory': False,
                'drop_last': False,
            }
            dataloader_cfg = OmegaConf.create(dataloader_cfg)
            infer_datalayer = self._setup_dataloader_from_config(
                cfg=dataloader_cfg, mode=INFERENCE_MODE)

            all_logits = []
            all_unique_ids = []
            for i, batch in enumerate(infer_datalayer):
                input_ids, token_type_ids, attention_mask, unique_ids = batch
                logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=token_type_ids.to(device),
                    attention_mask=attention_mask.to(device),
                )
                all_logits.append(logits)
                all_unique_ids.append(unique_ids)
            logits = torch.cat(all_logits)
            unique_ids = tensor2list(torch.cat(all_unique_ids))
            s, e = logits.split(dim=-1, split_size=1)
            start_logits = tensor2list(s.squeeze(-1))
            end_logits = tensor2list(e.squeeze(-1))
            (all_predictions, all_nbest,
             scores_diff) = infer_datalayer.dataset.get_predictions(
                 unique_ids=unique_ids,
                 start_logits=start_logits,
                 end_logits=end_logits,
                 n_best_size=self._cfg.dataset.n_best_size,
                 max_answer_length=self._cfg.dataset.max_answer_length,
                 version_2_with_negative=self._cfg.dataset.
                 version_2_with_negative,
                 null_score_diff_threshold=self._cfg.dataset.
                 null_score_diff_threshold,
                 do_lower_case=self._cfg.dataset.do_lower_case,
             )

            with open(file, 'r') as test_file_fp:
                test_data = json.load(test_file_fp)["data"]
                id_to_question_mapping = {}
                for title in test_data:
                    for par in title["paragraphs"]:
                        for question in par["qas"]:
                            id_to_question_mapping[
                                question["id"]] = question["question"]

            for question_id in all_predictions:
                all_predictions[question_id] = (
                    id_to_question_mapping[question_id],
                    all_predictions[question_id])

            if output_nbest_file is not None:
                with open(output_nbest_file, "w") as writer:
                    writer.write(json.dumps(all_nbest, indent=4) + "\n")
            if output_prediction_file is not None:
                with open(output_prediction_file, "w") as writer:
                    writer.write(json.dumps(all_predictions, indent=4) + "\n")

        finally:
            # set mode back to its original value
            self.train(mode=mode)
            logging.set_verbosity(logging_level)

        return all_predictions, all_nbest
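
A hedged usage sketch for the question-answering inference() method above; QAModel is assumed to be the owning class, and the checkpoint and SQuAD-format data paths are placeholders:

from nemo.collections.nlp.models import QAModel

# placeholder checkpoint and SQuAD-format input file
model = QAModel.restore_from("qa_model.nemo")
predictions, nbest = model.inference(
    file="dev-v1.1.json",
    batch_size=8,
    output_prediction_file="predictions.json",
    output_nbest_file="nbest.json",
)
print(len(predictions), "questions answered")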
Example #10
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5Base-En",
        choices=[
            x.pretrained_model_name
            for x in EncDecCTCModel.list_available_models()
        ],
    )
    parser.add_argument(
        "--tts_model_spec",
        type=str,
        default="Tacotron2-22050Hz",
        choices=[
            x.pretrained_model_name
            for x in SpectrogramGenerator.list_available_models()
        ],
    )
    parser.add_argument(
        "--tts_model_vocoder",
        type=str,
        default="WaveGlow-22050Hz",
        choices=[
            x.pretrained_model_name for x in Vocoder.list_available_models()
        ],
    )
    parser.add_argument("--wer_tolerance",
                        type=float,
                        default=1.0,
                        help="used by test")
    parser.add_argument("--trim", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.debug:
        logging.set_verbosity(logging.DEBUG)

    logging.info(f"Using NGC cloud ASR model {args.asr_model}")
    asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)
    logging.info(
        f"Using NGC cloud TTS Spectrogram Generator model {args.tts_model_spec}"
    )
    tts_model_spec = SpectrogramGenerator.from_pretrained(
        model_name=args.tts_model_spec)
    logging.info(f"Using NGC cloud TTS Vocoder model {args.tts_model_vocoder}")
    tts_model_vocoder = Vocoder.from_pretrained(
        model_name=args.tts_model_vocoder)
    models = [asr_model, tts_model_spec, tts_model_vocoder]

    if torch.cuda.is_available():
        for i, m in enumerate(models):
            models[i] = m.cuda()
    for m in models:
        m.eval()

    asr_model, tts_model_spec, tts_model_vocoder = models

    parser = parsers.make_parser(
        labels=asr_model.decoder.vocabulary,
        name="en",
        unk_id=-1,
        blank_id=-1,
        do_normalize=True,
    )
    labels_map = dict([(i, asr_model.decoder.vocabulary[i])
                       for i in range(len(asr_model.decoder.vocabulary))])

    tts_input = []
    asr_references = []
    longest_tts_input = 0
    for test_str in LIST_OF_TEST_STRINGS:
        tts_parsed_input = tts_model_spec.parse(test_str)
        if len(tts_parsed_input[0]) > longest_tts_input:
            longest_tts_input = len(tts_parsed_input[0])
        tts_input.append(tts_parsed_input.squeeze())

        asr_parsed = parser(test_str)
        asr_parsed = ''.join([labels_map[c] for c in asr_parsed])
        asr_references.append(asr_parsed)

    # Pad TTS Inputs
    for i, text in enumerate(tts_input):
        pad = (0, longest_tts_input - len(text))
        tts_input[i] = torch.nn.functional.pad(text, pad, value=68)

    logging.debug(tts_input)

    # Do TTS
    tts_input = torch.stack(tts_input)
    if torch.cuda.is_available():
        tts_input = tts_input.cuda()
    specs = tts_model_spec.generate_spectrogram(tokens=tts_input)
    audio = []
    step = ceil(len(specs) / 4)
    for i in range(4):
        audio.append(
            tts_model_vocoder.convert_spectrogram_to_audio(
                spec=specs[i * step:i * step + step]))

    audio = [item for sublist in audio for item in sublist]
    audio_file_paths = []
    # Save audio
    logging.debug(f"args.trim: {args.trim}")
    for i, aud in enumerate(audio):
        aud = aud.cpu().numpy()
        if args.trim:
            aud = librosa.effects.trim(aud, top_db=40)[0]
        librosa.output.write_wav(f"{i}.wav", aud, sr=22050)
        audio_file_paths.append(str(Path(f"{i}.wav")))

    # Do ASR
    hypotheses = asr_model.transcribe(audio_file_paths)
    for i, _ in enumerate(hypotheses):
        logging.debug(f"{i}")
        logging.debug(f"ref:'{asr_references[i]}'")
        logging.debug(f"hyp:'{hypotheses[i]}'")
    wer_value = word_error_rate(hypotheses=hypotheses,
                                references=asr_references)
    if wer_value > args.wer_tolerance:
        raise ValueError(
            f"Got WER of {wer_value}. It was higher than {args.wer_tolerance}")
    logging.info(f'Got WER of {wer_value}. Tolerance was {args.wer_tolerance}')
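
The pass/fail criterion of this script is a single WER comparison via word_error_rate. The same check in isolation, with illustrative strings and tolerance:

from nemo.collections.asr.metrics.wer import word_error_rate

hypotheses = ["the cat sat on a mat"]
references = ["the cat sat on the mat"]
wer_tolerance = 1.0

wer_value = word_error_rate(hypotheses=hypotheses, references=references)
if wer_value > wer_tolerance:
    raise ValueError(f"Got WER of {wer_value}. It was higher than {wer_tolerance}")
print(f"Got WER of {wer_value}. Tolerance was {wer_tolerance}")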
Example #11
    def transcribe(
        self,
        paths2audio_files: List[str],
        batch_size: int = 4,
        logprobs: bool = False,
        return_hypotheses: bool = False,
        num_workers: int = 0,
    ) -> List[str]:
        """
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

        Args:
            paths2audio_files: (a list) of paths to audio files. \
                Recommended length per file is between 5 and 25 seconds. \
                But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            logprobs: (bool) pass True to get log probabilities instead of transcripts.
            return_hypotheses: (bool) whether to return Hypothesis objects instead of plain text.
                Hypotheses allow postprocessing such as extracting timestamps or rescoring.
            num_workers: (int) number of workers for DataLoader

        Returns:
            A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
        """
        if paths2audio_files is None or len(paths2audio_files) == 0:
            return []

        if return_hypotheses and logprobs:
            raise ValueError(
                "Only one of `return_hypotheses` and `logprobs` can be True at a time. "
                "Returned hypotheses will contain the logprobs."
            )

        if num_workers is None:
            num_workers = min(batch_size, os.cpu_count() - 1)

        # We will store transcriptions here
        hypotheses = []
        # Model's mode and device
        mode = self.training
        device = next(self.parameters()).device
        dither_value = self.preprocessor.featurizer.dither
        pad_to_value = self.preprocessor.featurizer.pad_to

        try:
            self.preprocessor.featurizer.dither = 0.0
            self.preprocessor.featurizer.pad_to = 0
            # Switch model to evaluation mode
            self.eval()
            # Freeze the encoder and decoder modules
            self.encoder.freeze()
            self.decoder.freeze()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            # Work in tmp directory - will store manifest file there
            with tempfile.TemporaryDirectory() as tmpdir:
                with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp:
                    for audio_file in paths2audio_files:
                        entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''}
                        fp.write(json.dumps(entry) + '\n')

                config = {
                    'paths2audio_files': paths2audio_files,
                    'batch_size': batch_size,
                    'temp_dir': tmpdir,
                    'num_workers': num_workers,
                }

                temporary_datalayer = self._setup_transcribe_dataloader(config)
                for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
                    logits, logits_len, greedy_predictions = self.forward(
                        input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
                    )
                    if logprobs:
                        # dump log probs per file
                        for idx in range(logits.shape[0]):
                            lg = logits[idx][: logits_len[idx]]
                            hypotheses.append(lg.cpu().numpy())
                    else:
                        current_hypotheses = self._wer.ctc_decoder_predictions_tensor(
                            greedy_predictions, predictions_len=logits_len, return_hypotheses=return_hypotheses,
                        )

                        if return_hypotheses:
                            # dump log probs per file
                            for idx in range(logits.shape[0]):
                                current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]]

                        hypotheses += current_hypotheses

                    del greedy_predictions
                    del logits
                    del test_batch
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            self.preprocessor.featurizer.dither = dither_value
            self.preprocessor.featurizer.pad_to = pad_to_value
            if mode is True:
                self.encoder.unfreeze()
                self.decoder.unfreeze()
            logging.set_verbosity(logging_level)
        return hypotheses
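
A hedged sketch contrasting the three output modes of this transcribe() variant (plain text, per-file log-probability arrays, and Hypothesis objects); the checkpoint name is reused from Example #10 and may differ in your setup:

from nemo.collections.asr.models import EncDecCTCModel

model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
files = ["sample.wav"]  # placeholder audio path

texts = model.transcribe(paths2audio_files=files, batch_size=1)
probs = model.transcribe(paths2audio_files=files, batch_size=1, logprobs=True)
hyps = model.transcribe(paths2audio_files=files, batch_size=1, return_hypotheses=True)

print(texts[0])            # decoded transcript
print(probs[0].shape)      # (time, num_classes) log-probability array
print(hyps[0].y_sequence)  # per-frame logits attached to the hypothesis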
Example #12
    def transcribe(
        self,
        paths2audio_files: List[str],
        batch_size: int = 4,
        return_hypotheses: bool = False,
        partial_hypothesis: Optional[List['Hypothesis']] = None,
        num_workers: int = 0,
    ) -> (List[str], Optional[List['Hypothesis']]):
        """
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

        Args:

            paths2audio_files: (a list) of paths to audio files. \
        Recommended length per file is between 5 and 25 seconds. \
        But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference. \
        Bigger will result in better throughput performance but would use more memory.
            return_hypotheses: (bool) whether to return Hypothesis objects instead of plain text.
        Hypotheses allow postprocessing such as extracting timestamps or rescoring.
            num_workers: (int) number of workers for DataLoader

        Returns:
            A list of transcriptions in the same order as paths2audio_files. Will also return the list of all hypotheses produced for each sample.
        """
        if paths2audio_files is None or len(paths2audio_files) == 0:
            return [], []
        # We will store transcriptions here
        hypotheses = []
        all_hypotheses = []
        # Model's mode and device
        mode = self.training
        device = next(self.parameters()).device
        dither_value = self.preprocessor.featurizer.dither
        pad_to_value = self.preprocessor.featurizer.pad_to

        if num_workers is None:
            num_workers = min(batch_size, os.cpu_count() - 1)

        try:
            self.preprocessor.featurizer.dither = 0.0
            self.preprocessor.featurizer.pad_to = 0

            # Switch model to evaluation mode
            self.eval()
            # Freeze the encoder and decoder modules
            self.encoder.freeze()
            self.decoder.freeze()
            self.joint.freeze()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            # Work in tmp directory - will store manifest file there
            with tempfile.TemporaryDirectory() as tmpdir:
                with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp:
                    for audio_file in paths2audio_files:
                        entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''}
                        fp.write(json.dumps(entry) + '\n')

                config = {
                    'paths2audio_files': paths2audio_files,
                    'batch_size': batch_size,
                    'temp_dir': tmpdir,
                    'num_workers': num_workers,
                }

                temporary_datalayer = self._setup_transcribe_dataloader(config)
                for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
                    encoded, encoded_len = self.forward(
                        input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
                    )
                    best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor(
                        encoded,
                        encoded_len,
                        return_hypotheses=return_hypotheses,
                        partial_hypotheses=partial_hypothesis,
                    )

                    hypotheses += best_hyp
                    if all_hyp is not None:
                        all_hypotheses += all_hyp
                    else:
                        all_hypotheses += best_hyp

                    del encoded
                    del test_batch
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            self.preprocessor.featurizer.dither = dither_value
            self.preprocessor.featurizer.pad_to = pad_to_value

            logging.set_verbosity(logging_level)
            if mode is True:
                self.encoder.unfreeze()
                self.decoder.unfreeze()
                self.joint.unfreeze()
        return hypotheses, all_hypotheses
Example #13
    def ptune_inference(self,
                        queries: List[Dict],
                        batch_size: int = 1,
                        decode_token_len: int = None) -> List[str]:
        """
        Get prediction for the queries
        Args:
            queries: List of data samples without labels
            batch_size: batch size to use during inference
            decode_token_len: max number of tokens to generate during inference
        Returns:
            all_preds: model predictions
        """
        if decode_token_len is None:
            decode_token_len = self.decoder_seq_length
        # store predictions for all queries in a single list
        all_preds = []
        mode = self.training
        try:
            # Switch model to evaluation mode
            self.eval()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            dataloader_cfg = {
                "batch_size": batch_size,
                "num_workers": 3,
                "pin_memory": False
            }
            infer_datalayer = self._setup_infer_dataloader(
                dataloader_cfg, queries, decode_token_len)
            for i, batch in enumerate(infer_datalayer):
                tokens_enc = batch['text_enc'].to(self.device)
                enc_taskname = batch['enc_taskname'].to(self.device)
                enc_mask = batch['enc_mask'].to(self.device)

                input_embeds = self.embed_input(tokens_enc, enc_taskname)

                encoder_position_ids = build_position_ids(tokens_enc)

                position_embeddings = self.position_embeddings(
                    encoder_position_ids)

                encoder_input = input_embeds + position_embeddings

                # loss, tokens_enc, labels, enc_mask, encoder_input = self.get_loss(batch)
                if self.float_type == torch.float32:
                    predicted_token_ids, _ = self.model.decode(
                        tokens_enc=tokens_enc,
                        enc_mask=enc_mask,
                        num_tokens_to_generate=decode_token_len,
                        enc_input=encoder_input,
                    )
                else:
                    with torch.autocast(device_type="cuda",
                                        dtype=self.float_type):
                        predicted_token_ids, _ = self.model.decode(
                            tokens_enc=tokens_enc,
                            enc_mask=enc_mask,
                            num_tokens_to_generate=decode_token_len,
                            enc_input=encoder_input,
                        )

                preds = predicted_token_ids.cpu().numpy().tolist()
                for i, pred in enumerate(preds):
                    if self.tokenizer.eos_id in pred:
                        idx = pred.index(self.tokenizer.eos_id)
                        pred = pred[:idx]
                    pred = [
                        id for id in pred if id not in
                        self.tokenizer.special_token_to_id.values()
                    ]
                    pred = self.tokenizer.ids_to_text(pred)
                    all_preds.append(pred)
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            logging.set_verbosity(logging_level)
        return all_preds
Example #14
def transcribe_partial_audio(
    asr_model,
    path2manifest: str,
    batch_size: int = 4,
    logprobs: bool = False,
    return_hypotheses: bool = False,
    num_workers: int = 0,
) -> List[str]:

    assert isinstance(asr_model,
                      EncDecCTCModel), "Currently support CTC model only."

    if return_hypotheses and logprobs:
        raise ValueError(
            "Only one of `return_hypotheses` and `logprobs` can be True at a time. "
            "Returned hypotheses will contain the logprobs.")
    if num_workers is None:
        num_workers = min(batch_size, os.cpu_count() - 1)

    # We will store transcriptions here
    hypotheses = []
    # Model's mode and device
    mode = asr_model.training
    device = next(asr_model.parameters()).device
    dither_value = asr_model.preprocessor.featurizer.dither
    pad_to_value = asr_model.preprocessor.featurizer.pad_to

    try:
        asr_model.preprocessor.featurizer.dither = 0.0
        asr_model.preprocessor.featurizer.pad_to = 0
        # Switch model to evaluation mode
        asr_model.eval()
        # Freeze the encoder and decoder modules
        asr_model.encoder.freeze()
        asr_model.decoder.freeze()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)

        config = {
            'manifest_filepath': path2manifest,
            'batch_size': batch_size,
            'num_workers': num_workers,
        }

        temporary_datalayer = asr_model._setup_transcribe_dataloader(config)
        for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
            logits, logits_len, greedy_predictions = asr_model.forward(
                input_signal=test_batch[0].to(device),
                input_signal_length=test_batch[1].to(device))
            if logprobs:
                # dump log probs per file
                for idx in range(logits.shape[0]):
                    lg = logits[idx][:logits_len[idx]]
                    hypotheses.append(lg.cpu().numpy())
            else:
                current_hypotheses = asr_model._wer.ctc_decoder_predictions_tensor(
                    greedy_predictions,
                    predictions_len=logits_len,
                    return_hypotheses=return_hypotheses,
                )

                if return_hypotheses:
                    # dump log probs per file
                    for idx in range(logits.shape[0]):
                        current_hypotheses[idx].y_sequence = logits[
                            idx][:logits_len[idx]]

                hypotheses += current_hypotheses

            del greedy_predictions
            del logits
            del test_batch

    finally:
        # set mode back to its original value
        asr_model.train(mode=mode)
        asr_model.preprocessor.featurizer.dither = dither_value
        asr_model.preprocessor.featurizer.pad_to = pad_to_value
        if mode is True:
            asr_model.encoder.unfreeze()
            asr_model.decoder.unfreeze()
        logging.set_verbosity(logging_level)
    return hypotheses
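
A hedged usage sketch for transcribe_partial_audio(), assuming the function above is in scope; the checkpoint name is reused from Example #10 and the manifest path is a placeholder pointing at a NeMo-style JSON-lines manifest describing the audio segments to transcribe:

from nemo.collections.asr.models import EncDecCTCModel

# placeholder checkpoint and manifest path
asr_model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

hypotheses = transcribe_partial_audio(
    asr_model=asr_model,
    path2manifest="segments_manifest.json",
    batch_size=4,
    num_workers=2,
)
print(hypotheses[:3])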