def __init__(self, cfg: DictConfig, trainer=None):
    if 'tokenizer' not in cfg:
        raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")
    self.tokenizer_cfg = OmegaConf.to_container(cfg.tokenizer, resolve=True)  # type: dict
    self.tokenizer_dir = self.tokenizer_cfg.pop('dir')  # Remove tokenizer directory
    self.tokenizer_type = self.tokenizer_cfg.pop('type').lower()  # Remove tokenizer_type

    # Setup the tokenizer
    self._setup_tokenizer()

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    cfg.decoder.params.vocabulary = ListConfig(list(vocabulary.values()))

    # Override number of classes if placeholder provided
    if cfg.decoder.params['num_classes'] < 1:
        logging.info(
            "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format(
                cfg.decoder.params['num_classes'], len(vocabulary)
            )
        )
        cfg.decoder.params['num_classes'] = len(vocabulary)

    super().__init__(cfg=cfg, trainer=trainer)

    # Setup metric objects
    self._wer = WERBPE(tokenizer=self.tokenizer, batch_dim_index=0, use_cer=False, ctc_decode=True)
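# --- Illustrative sketch (not part of the original source) ---
# A minimal shape of the config that this older `__init__` expects: `cfg.tokenizer` must carry
# `dir` and `type` (both popped above), and the decoder options live under `decoder.params`.
# The path and values below are hypothetical placeholders, not taken from the original code.
from omegaconf import OmegaConf

_example_cfg = OmegaConf.create(
    {
        'tokenizer': {'dir': '/path/to/tokenizer_dir', 'type': 'bpe'},  # hypothetical path
        'decoder': {'params': {'num_classes': -1, 'vocabulary': None}},  # -1 placeholder gets replaced
    }
)
# model = EncDecCTCModelBPE(cfg=_example_cfg)  # assumes the remaining model config sections are present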
def test_wer_metric_randomized(self, test_wer_bpe):
    """This test relies on correctness of word_error_rate function."""

    def __random_string(length):
        return ''.join(random.choice(''.join(self.vocabulary)) for _ in range(length))

    if test_wer_bpe:
        wer = WERBPE(deepcopy(self.char_tokenizer), batch_dim_index=0, use_cer=False, ctc_decode=True)
    else:
        wer = WER(vocabulary=self.vocabulary, batch_dim_index=0, use_cer=False, ctc_decode=True)

    for test_id in range(256):
        n1 = random.randint(1, 512)
        n2 = random.randint(1, 512)
        s1 = __random_string(n1)
        s2 = __random_string(n2)
        # skip empty strings as reference
        if s2.strip():
            assert (
                abs(
                    self.get_wer(wer, prediction=s1, reference=s2, use_tokenizer=test_wer_bpe)
                    - word_error_rate(hypotheses=[s1], references=[s2])
                )
                < 1e-6
            )
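# --- Illustrative sketch (not part of the original source) ---
# The randomized test above compares the metric object against `word_error_rate`, which computes
# (word-level edit distance) / (number of reference words). A tiny worked example, assuming
# `word_error_rate` is the same function the test imports; the sentences are made up:
# hypotheses=['i like cats'] vs references=['i like hats'] -> 1 substitution / 3 reference words ~ 0.333
_wer_value = word_error_rate(hypotheses=['i like cats'], references=['i like hats'])
assert abs(_wer_value - 1.0 / 3.0) < 1e-6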
def change_vocabulary(self, new_tokenizer_dir: str, new_tokenizer_type: str):
    """
    Changes vocabulary of the tokenizer used during CTC decoding process.
    Use this method when fine-tuning from a pre-trained model.
    This method changes only the decoder and leaves the encoder and pre-processing modules unchanged.
    For example, you would use it if you want to use a pretrained encoder when fine-tuning on data in
    another language, or when you'd need the model to learn capitalization, punctuation and/or special characters.

    Args:
        new_tokenizer_dir: Path to the new tokenizer directory.
        new_tokenizer_type: Either `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
            whereas `wpe` is used for `BertTokenizer`.

    Returns: None
    """
    if not os.path.isdir(new_tokenizer_dir):
        raise NotADirectoryError(
            f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}'
        )

    if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
        raise ValueError('New tokenizer type must be either `bpe` or `wpe`')

    self.tokenizer_dir = new_tokenizer_dir  # Set the new tokenizer directory
    self.tokenizer_type = new_tokenizer_type.lower()  # Set the new tokenizer type

    # Setup the tokenizer
    self._setup_tokenizer()

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    decoder_config = copy.deepcopy(self.decoder.to_config_dict())
    decoder_config.params.vocabulary = ListConfig(list(vocabulary.values()))

    # Override number of classes if placeholder provided
    logging.info(
        "\nReplacing old number of classes ({}) with new number of classes - {}".format(
            decoder_config['params']['num_classes'], len(vocabulary)
        )
    )
    decoder_config['params']['num_classes'] = len(vocabulary)

    del self.decoder
    self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config)
    del self.loss
    self.loss = CTCLoss(num_classes=self.decoder.num_classes_with_blank - 1, zero_infinity=True)
    self._wer = WERBPE(tokenizer=self.tokenizer, batch_dim_index=0, use_cer=False, ctc_decode=True)

    # Update config
    OmegaConf.set_struct(self._cfg.decoder, False)
    self._cfg.decoder = decoder_config
    OmegaConf.set_struct(self._cfg.decoder, True)

    logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.")
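# --- Illustrative sketch (not part of the original source) ---
# Typical fine-tuning flow for this version of `change_vocabulary`: load a pre-trained checkpoint,
# then swap in a new tokenizer directory before training on the new data. The model name and the
# tokenizer path below are hypothetical placeholders.
# model = EncDecCTCModelBPE.from_pretrained(model_name="...")          # some pre-trained BPE CTC model
# model.change_vocabulary(new_tokenizer_dir='/path/to/new_tokenizer',  # hypothetical directory
#                         new_tokenizer_type='bpe')
# After this call only the decoder, the CTC loss and the WER metric are rebuilt; the encoder keeps
# its pre-trained weights.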
def test_wer_metric_decode(self, test_wer_bpe):
    if test_wer_bpe:
        wer = WERBPE(self.char_tokenizer, batch_dim_index=0, use_cer=False, ctc_decode=True)
    else:
        wer = WER(vocabulary=self.vocabulary.copy(), batch_dim_index=0, use_cer=False, ctc_decode=True)

    tokens = self.__string_to_ctc_tensor('cat', use_tokenizer=test_wer_bpe)[0].int().numpy().tolist()
    assert tokens == [3, 1, 20]

    tokens_decoded = wer.decode_ids_to_tokens(tokens)
    assert tokens_decoded == ['c', 'a', 't']

    str_decoded = wer.decode_tokens_to_str(tokens)
    assert str_decoded == 'cat'
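# --- Illustrative sketch (not part of the original source) ---
# The expected ids [3, 1, 20] are consistent with a character vocabulary that starts with a space
# followed by the lowercase alphabet, i.e. [' ', 'a', 'b', 'c', ..., 'z'] (an assumption about the
# test fixture, not taken from the original file): 'c' -> 3, 'a' -> 1, 't' -> 20.
_assumed_vocabulary = [' '] + [chr(c) for c in range(ord('a'), ord('z') + 1)]
assert [_assumed_vocabulary.index(ch) for ch in 'cat'] == [3, 1, 20]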
def __init__(self, cfg: DictConfig, trainer=None):
    if 'tokenizer' not in cfg:
        raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")

    # Setup the tokenizer
    self._setup_tokenizer(cfg.tokenizer)

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    with open_dict(cfg):
        if "params" in cfg.decoder:
            cfg.decoder.params.vocabulary = ListConfig(list(vocabulary.values()))
        else:
            cfg.decoder.vocabulary = ListConfig(list(vocabulary.values()))

    # Override number of classes if placeholder provided
    if "params" in cfg.decoder:
        num_classes = cfg.decoder["params"]["num_classes"]
    else:
        num_classes = cfg.decoder["num_classes"]

    if num_classes < 1:
        logging.info(
            "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format(
                num_classes, len(vocabulary)
            )
        )
        if "params" in cfg.decoder:
            cfg.decoder["params"]["num_classes"] = len(vocabulary)
        else:
            cfg.decoder["num_classes"] = len(vocabulary)

    super().__init__(cfg=cfg, trainer=trainer)

    # Setup metric objects
    self._wer = WERBPE(
        tokenizer=self.tokenizer,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        ctc_decode=True,
        dist_sync_on_step=True,
        log_prediction=self._cfg.get("log_prediction", False),
    )
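# --- Illustrative sketch (not part of the original source) ---
# `open_dict` is needed above because model configs are usually in "struct" mode, where assigning a
# key that does not exist yet raises an error. A minimal, self-contained demonstration with a made-up
# config (not the model's real config):
from omegaconf import OmegaConf, open_dict

_cfg = OmegaConf.create({'decoder': {'num_classes': -1}})
OmegaConf.set_struct(_cfg, True)          # struct mode: unknown keys cannot be added...
with open_dict(_cfg):
    _cfg.decoder.vocabulary = ['a', 'b']  # ...unless the assignment happens inside open_dict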
def __init__(self, cfg: DictConfig, trainer=None):
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.convert_model_config_to_dict_config(cfg)
    cfg = model_utils.maybe_update_config_version(cfg)

    if 'tokenizer' not in cfg:
        raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")

    # Setup the tokenizer
    self._setup_tokenizer(cfg.tokenizer)

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    with open_dict(cfg):
        # sidestepping the potential overlapping tokens issue in aggregate tokenizers
        if self.tokenizer_type == "agg":
            cfg.decoder.vocabulary = ListConfig(vocabulary)
        else:
            cfg.decoder.vocabulary = ListConfig(list(vocabulary.keys()))

    # Override number of classes if placeholder provided
    num_classes = cfg.decoder["num_classes"]
    if num_classes < 1:
        logging.info(
            "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format(
                num_classes, len(vocabulary)
            )
        )
        cfg.decoder["num_classes"] = len(vocabulary)

    super().__init__(cfg=cfg, trainer=trainer)

    # Setup metric objects
    self._wer = WERBPE(
        tokenizer=self.tokenizer,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        ctc_decode=True,
        dist_sync_on_step=True,
        log_prediction=self._cfg.get("log_prediction", False),
    )
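# --- Illustrative sketch (not part of the original source) ---
# The branch above exists because `get_vocab()` returns different shapes depending on the tokenizer:
# for `bpe`/`wpe` tokenizers it is a token -> id mapping, so the decoder vocabulary is its keys,
# while for the aggregate ("agg") tokenizer the returned object is used as-is to sidestep tokens that
# repeat across per-language sub-tokenizers. A dict-based illustration with made-up tokens:
_vocab_mapping = {'<unk>': 0, '▁the': 1, '▁a': 2, 's': 3}  # hypothetical token -> id mapping
_decoder_vocabulary = list(_vocab_mapping.keys())           # ['<unk>', '▁the', '▁a', 's']
assert len(_decoder_vocabulary) == len(_vocab_mapping)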
def test_wer_metric_simple(self, batch_dim_index, test_wer_bpe):
    if test_wer_bpe:
        wer = WERBPE(self.char_tokenizer, batch_dim_index, use_cer=False, ctc_decode=True)
    else:
        wer = WER(vocabulary=self.vocabulary, batch_dim_index=batch_dim_index, use_cer=False, ctc_decode=True)

    assert self.get_wer(wer, 'cat', 'cot', test_wer_bpe) == 1.0
    assert self.get_wer(wer, 'gpu', 'g p u', test_wer_bpe) == 1.0
    assert self.get_wer(wer, 'g p u', 'gpu', test_wer_bpe) == 3.0
    assert self.get_wer(wer, 'ducati motorcycle', 'motorcycle', test_wer_bpe) == 1.0
    assert self.get_wer(wer, 'ducati motorcycle', 'ducuti motorcycle', test_wer_bpe) == 0.5
    assert abs(self.get_wer(wer, 'a f c', 'a b c', test_wer_bpe) - 1.0 / 3.0) < 1e-6
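# --- Illustrative sketch (not part of the original source) ---
# The expected values above follow from WER = (word-level edit distance) / (number of reference words),
# with the prediction passed first and the reference second:
#   'cat' vs 'cot'                             -> 1 substitution / 1 reference word        = 1.0
#   'gpu' vs 'g p u'                           -> 1 substitution + 2 deletions / 3 words   = 1.0
#   'g p u' vs 'gpu'                           -> 1 substitution + 2 insertions / 1 word   = 3.0
#   'ducati motorcycle' vs 'motorcycle'        -> 1 insertion / 1 reference word           = 1.0
#   'ducati motorcycle' vs 'ducuti motorcycle' -> 1 substitution / 2 reference words       = 0.5
#   'a f c' vs 'a b c'                         -> 1 substitution / 3 reference words       ~ 0.333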
def change_vocabulary(self, new_tokenizer_dir: Union[str, DictConfig], new_tokenizer_type: str):
    """
    Changes vocabulary of the tokenizer used during CTC decoding process.
    Use this method when fine-tuning from a pre-trained model.
    This method changes only the decoder and leaves the encoder and pre-processing modules unchanged.
    For example, you would use it if you want to use a pretrained encoder when fine-tuning on data in
    another language, or when you'd need the model to learn capitalization, punctuation and/or special characters.

    Args:
        new_tokenizer_dir: Directory path to the tokenizer, or a config for the new tokenizer
            (if the tokenizer type is `agg`).
        new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
            whereas `wpe` is used for `BertTokenizer`.

    Returns: None
    """
    if isinstance(new_tokenizer_dir, DictConfig):
        if new_tokenizer_type == 'agg':
            new_tokenizer_cfg = new_tokenizer_dir
        else:
            raise ValueError(
                f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}'
            )
    else:
        new_tokenizer_cfg = None

    if new_tokenizer_cfg is not None:
        tokenizer_cfg = new_tokenizer_cfg
    else:
        if not os.path.isdir(new_tokenizer_dir):
            raise NotADirectoryError(
                f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}'
            )

        if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
            raise ValueError('New tokenizer type must be either `bpe` or `wpe`')

        tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type})

    # Setup the tokenizer
    self._setup_tokenizer(tokenizer_cfg)

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    decoder_config = copy.deepcopy(self.decoder.to_config_dict())
    # sidestepping the potential overlapping tokens issue in aggregate tokenizers
    if self.tokenizer_type == "agg":
        decoder_config.vocabulary = ListConfig(vocabulary)
    else:
        decoder_config.vocabulary = ListConfig(list(vocabulary.keys()))

    decoder_num_classes = decoder_config['num_classes']

    # Override number of classes if placeholder provided
    logging.info(
        "\nReplacing old number of classes ({}) with new number of classes - {}".format(
            decoder_num_classes, len(vocabulary)
        )
    )
    decoder_config['num_classes'] = len(vocabulary)

    del self.decoder
    self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config)
    del self.loss
    self.loss = CTCLoss(
        num_classes=self.decoder.num_classes_with_blank - 1,
        zero_infinity=True,
        reduction=self._cfg.get("ctc_reduction", "mean_batch"),
    )
    self._wer = WERBPE(
        tokenizer=self.tokenizer,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        ctc_decode=True,
        log_prediction=self._cfg.get("log_prediction", False),
    )

    # Update config
    OmegaConf.set_struct(self._cfg.decoder, False)
    self._cfg.decoder = decoder_config
    OmegaConf.set_struct(self._cfg.decoder, True)

    logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.")
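# --- Illustrative sketch (not part of the original source) ---
# Two ways this newer `change_vocabulary` can be called. The path below is a hypothetical placeholder,
# and the contents of the aggregate tokenizer config are model-specific, so they are left elided.
#
# 1) Directory-based tokenizer swap (`bpe` or `wpe`); the method wraps dir/type into a config itself:
# model.change_vocabulary(new_tokenizer_dir='/path/to/new_tokenizer', new_tokenizer_type='bpe')
#
# 2) Aggregate tokenizer swap: pass a DictConfig instead of a directory and set the type to 'agg':
# from omegaconf import OmegaConf
# agg_cfg = OmegaConf.create({...})  # aggregate tokenizer config; structure depends on the model
# model.change_vocabulary(new_tokenizer_dir=agg_cfg, new_tokenizer_type='agg')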