def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    self.global_rank = 0
    self.world_size = 0
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    super().__init__(cfg=cfg, trainer=trainer)
    self.preprocessor = EncDecCTCModel.from_config_dict(self._cfg.preprocessor)
    self.encoder = EncDecCTCModel.from_config_dict(self._cfg.encoder)
    self.decoder = EncDecCTCModel.from_config_dict(self._cfg.decoder)
    self.loss = CTCLoss(num_classes=self.decoder.num_classes_with_blank - 1, zero_infinity=True)

    if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None:
        self.spec_augmentation = EncDecCTCModel.from_config_dict(self._cfg.spec_augment)
    else:
        self.spec_augmentation = None

    # Setup metric objects
    self._wer = WER(vocabulary=self.decoder.vocabulary, batch_dim_index=0, use_cer=False, ctc_decode=True)

def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    self.global_rank = 0
    self.world_size = 1
    self.local_rank = 0
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus
        self.local_rank = trainer.local_rank

    super().__init__(cfg=cfg, trainer=trainer)
    self.preprocessor = EncDecCTCModel.from_config_dict(self._cfg.preprocessor)
    self.encoder = EncDecCTCModel.from_config_dict(self._cfg.encoder)

    with open_dict(self._cfg):
        if "params" in self._cfg.decoder:
            if "feat_in" not in self._cfg.decoder.params or (
                not self._cfg.decoder.params.feat_in and hasattr(self.encoder, '_feat_out')
            ):
                self._cfg.decoder.params.feat_in = self.encoder._feat_out
            if "feat_in" not in self._cfg.decoder.params or not self._cfg.decoder.params.feat_in:
                raise ValueError("param feat_in of the decoder's config is not set!")
        else:
            if "feat_in" not in self._cfg.decoder or (
                not self._cfg.decoder.feat_in and hasattr(self.encoder, '_feat_out')
            ):
                self._cfg.decoder.feat_in = self.encoder._feat_out
            if "feat_in" not in self._cfg.decoder or not self._cfg.decoder.feat_in:
                raise ValueError("param feat_in of the decoder's config is not set!")

    self.decoder = EncDecCTCModel.from_config_dict(self._cfg.decoder)
    self.loss = CTCLoss(
        num_classes=self.decoder.num_classes_with_blank - 1,
        zero_infinity=True,
        reduction=self._cfg.get("ctc_reduction", "mean_batch"),
    )

    if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None:
        self.spec_augmentation = EncDecCTCModel.from_config_dict(self._cfg.spec_augment)
    else:
        self.spec_augmentation = None

    # Setup metric objects
    self._wer = WER(
        vocabulary=self.decoder.vocabulary,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        ctc_decode=True,
        dist_sync_on_step=True,
        log_prediction=self._cfg.get("log_prediction", False),
    )

def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    # global_rank and local_rank are set by LightningModule in Lightning 1.2.0
    self.world_size = 1
    if trainer is not None:
        self.world_size = trainer.world_size

    super().__init__(cfg=cfg, trainer=trainer)
    self.preprocessor = EncDecCTCModel.from_config_dict(self._cfg.preprocessor)
    self.encoder = EncDecCTCModel.from_config_dict(self._cfg.encoder)

    with open_dict(self._cfg):
        if "feat_in" not in self._cfg.decoder or (
            not self._cfg.decoder.feat_in and hasattr(self.encoder, '_feat_out')
        ):
            self._cfg.decoder.feat_in = self.encoder._feat_out
        if "feat_in" not in self._cfg.decoder or not self._cfg.decoder.feat_in:
            raise ValueError("param feat_in of the decoder's config is not set!")

        if self.cfg.decoder.num_classes < 1 and self.cfg.decoder.vocabulary is not None:
            logging.info(
                "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format(
                    self.cfg.decoder.num_classes, len(self.cfg.decoder.vocabulary)
                )
            )
            cfg.decoder["num_classes"] = len(self.cfg.decoder.vocabulary)

    self.decoder = EncDecCTCModel.from_config_dict(self._cfg.decoder)
    self.loss = CTCLoss(
        num_classes=self.decoder.num_classes_with_blank - 1,
        zero_infinity=True,
        reduction=self._cfg.get("ctc_reduction", "mean_batch"),
    )

    if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None:
        self.spec_augmentation = EncDecCTCModel.from_config_dict(self._cfg.spec_augment)
    else:
        self.spec_augmentation = None

    # Setup metric objects
    self._wer = WER(
        vocabulary=self.decoder.vocabulary,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        ctc_decode=True,
        dist_sync_on_step=True,
        log_prediction=self._cfg.get("log_prediction", False),
    )

    # Setup optional Optimization flags
    self.setup_optimization_flags()

    # Adapter modules setup (from ASRAdapterModelMixin)
    self.setup_adapters()

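# --- Usage sketch (illustrative, not part of the original source) ---
# A minimal example of how this constructor is typically reached: the model is built from an
# OmegaConf `DictConfig` (usually assembled by Hydra) together with a Lightning Trainer.
# The config path below is an assumption; the trainer arguments are only an example.
#
#   from omegaconf import OmegaConf
#   import pytorch_lightning as pl
#
#   cfg = OmegaConf.load("<path/to/model_config.yaml>")  # hypothetical config file
#   trainer = pl.Trainer(max_epochs=1)
#   asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)
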
def change_vocabulary(self, new_tokenizer_dir: str, new_tokenizer_type: str):
    """
    Changes vocabulary of the tokenizer used during CTC decoding process.
    Use this method when fine-tuning from a pre-trained model.
    This method changes only the decoder and leaves the encoder and pre-processing modules unchanged.
    For example, you would use it if you want to use a pretrained encoder when fine-tuning on data in
    another language, or when you'd need the model to learn capitalization, punctuation and/or
    special characters.

    Args:
        new_tokenizer_dir: Path to the new tokenizer directory.
        new_tokenizer_type: Either `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
            whereas `wpe` is used for `BertTokenizer`.

    Returns: None
    """
    if not os.path.isdir(new_tokenizer_dir):
        raise NotADirectoryError(
            f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}'
        )

    if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
        raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`')

    self.tokenizer_dir = new_tokenizer_dir  # Remove tokenizer directory
    self.tokenizer_type = new_tokenizer_type.lower()  # Remove tokenizer_type

    # Setup the tokenizer
    self._setup_tokenizer()

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    decoder_config = copy.deepcopy(self.decoder.to_config_dict())
    decoder_config.params.vocabulary = ListConfig(list(vocabulary.values()))

    # Override number of classes if placeholder provided
    logging.info(
        "\nReplacing old number of classes ({}) with new number of classes - {}".format(
            decoder_config['params']['num_classes'], len(vocabulary)
        )
    )
    decoder_config['params']['num_classes'] = len(vocabulary)

    del self.decoder
    self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config)
    del self.loss
    self.loss = CTCLoss(num_classes=self.decoder.num_classes_with_blank - 1, zero_infinity=True)
    self._wer = WERBPE(tokenizer=self.tokenizer, batch_dim_index=0, use_cer=False, ctc_decode=True)

    # Update config
    OmegaConf.set_struct(self._cfg.decoder, False)
    self._cfg.decoder = decoder_config
    OmegaConf.set_struct(self._cfg.decoder, True)

    logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.")

def change_vocabulary(self, new_vocabulary: List[str]):
    """
    Changes vocabulary used during CTC decoding process. Use this method when fine-tuning from a
    pre-trained model. This method changes only the decoder and leaves the encoder and
    pre-processing modules unchanged. For example, you would use it if you want to use a pretrained
    encoder when fine-tuning on data in another language, or when you'd need the model to learn
    capitalization, punctuation and/or special characters.

    If new_vocabulary == self.decoder.vocabulary then nothing will be changed.

    Args:
        new_vocabulary: List with new vocabulary. Must contain at least 2 elements. Typically,
            this is the target alphabet.

    Returns: None
    """
    if self.decoder.vocabulary == new_vocabulary:
        logging.warning(
            f"Old {self.decoder.vocabulary} and new {new_vocabulary} match. Not changing anything."
        )
    else:
        if new_vocabulary is None or len(new_vocabulary) == 0:
            raise ValueError(
                f'New vocabulary must be non-empty list of chars. But I got: {new_vocabulary}'
            )
        decoder_config = self.decoder.to_config_dict()
        new_decoder_config = copy.deepcopy(decoder_config)
        new_decoder_config['vocabulary'] = new_vocabulary
        new_decoder_config['num_classes'] = len(new_vocabulary)

        del self.decoder
        self.decoder = EncDecCTCModel.from_config_dict(new_decoder_config)
        del self.loss
        self.loss = CTCLoss(
            num_classes=self.decoder.num_classes_with_blank - 1,
            zero_infinity=True,
            reduction=self._cfg.get("ctc_reduction", "mean_batch"),
        )
        self._wer = WER(
            vocabulary=self.decoder.vocabulary,
            batch_dim_index=0,
            use_cer=self._cfg.get('use_cer', False),
            ctc_decode=True,
            dist_sync_on_step=True,
            log_prediction=self._cfg.get("log_prediction", False),
        )

        # Update config
        OmegaConf.set_struct(self._cfg.decoder, False)
        self._cfg.decoder = new_decoder_config
        OmegaConf.set_struct(self._cfg.decoder, True)

        logging.info(f"Changed decoder to output to {self.decoder.vocabulary} vocabulary.")

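# --- Usage sketch (illustrative, not part of the original source) ---
# A minimal example of how this character-based `change_vocabulary` might be called after
# restoring a pre-trained model. The checkpoint name and the alphabet below are assumptions
# used only for illustration.
#
#   asr_model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
#   asr_model.change_vocabulary(new_vocabulary=[" ", "a", "b", "c", "'"])
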
def change_vocabulary(self, new_tokenizer_dir: Union[str, DictConfig], new_tokenizer_type: str):
    """
    Changes vocabulary of the tokenizer used during CTC decoding process.
    Use this method when fine-tuning from a pre-trained model.
    This method changes only the decoder and leaves the encoder and pre-processing modules unchanged.
    For example, you would use it if you want to use a pretrained encoder when fine-tuning on data in
    another language, or when you'd need the model to learn capitalization, punctuation and/or
    special characters.

    Args:
        new_tokenizer_dir: Directory path to the tokenizer, or a config for a new tokenizer
            (if the tokenizer type is `agg`). A config, if provided, pre-empts the dir and type.
        new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
            whereas `wpe` is used for `BertTokenizer`.

    Returns: None
    """
    if isinstance(new_tokenizer_dir, DictConfig):
        if new_tokenizer_type == 'agg':
            new_tokenizer_cfg = new_tokenizer_dir
        else:
            raise ValueError(
                f'New tokenizer dir should be a string unless the tokenizer is `agg`, '
                f'but this tokenizer type is: {new_tokenizer_type}'
            )
    else:
        new_tokenizer_cfg = None

    if new_tokenizer_cfg is not None:
        tokenizer_cfg = new_tokenizer_cfg
    else:
        if not os.path.isdir(new_tokenizer_dir):
            raise NotADirectoryError(
                f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}'
            )

        if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
            raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`')

        tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type})

    # Setup the tokenizer
    self._setup_tokenizer(tokenizer_cfg)

    # Initialize a dummy vocabulary
    vocabulary = self.tokenizer.tokenizer.get_vocab()

    # Set the new vocabulary
    decoder_config = copy.deepcopy(self.decoder.to_config_dict())
    # sidestepping the potential overlapping tokens issue in aggregate tokenizers
    if self.tokenizer_type == "agg":
        decoder_config.vocabulary = ListConfig(vocabulary)
    else:
        decoder_config.vocabulary = ListConfig(list(vocabulary.keys()))

    decoder_num_classes = decoder_config['num_classes']

    # Override number of classes if placeholder provided
    logging.info(
        "\nReplacing old number of classes ({}) with new number of classes - {}".format(
            decoder_num_classes, len(vocabulary)
        )
    )
    decoder_config['num_classes'] = len(vocabulary)

    del self.decoder
    self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config)
    del self.loss
    self.loss = CTCLoss(
        num_classes=self.decoder.num_classes_with_blank - 1,
        zero_infinity=True,
        reduction=self._cfg.get("ctc_reduction", "mean_batch"),
    )
    self._wer = WERBPE(
        tokenizer=self.tokenizer,
        batch_dim_index=0,
        use_cer=self._cfg.get('use_cer', False),
        ctc_decode=True,
        log_prediction=self._cfg.get("log_prediction", False),
    )

    # Update config
    OmegaConf.set_struct(self._cfg.decoder, False)
    self._cfg.decoder = decoder_config
    OmegaConf.set_struct(self._cfg.decoder, True)

    logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.")

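# --- Usage sketch (illustrative, not part of the original source) ---
# A minimal example of how this tokenizer-based `change_vocabulary` might be called on a
# BPE CTC model. The checkpoint name, tokenizer directory and tokenizer type below are
# assumptions used only for illustration.
#
#   asr_model = EncDecCTCModelBPE.from_pretrained(model_name="stt_en_conformer_ctc_small")
#   asr_model.change_vocabulary(
#       new_tokenizer_dir="<path/to/new/tokenizer_dir>",
#       new_tokenizer_type="bpe",
#   )
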