def test_tarred_bpe_dataset(self, test_data_dir):
    manifest_path = os.path.abspath(os.path.join(test_data_dir, 'asr/tarred_an4/tarred_audio_manifest.json'))

    tokenizer_path = os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128", 'vocab.txt')
    tokenizer = tokenizers.AutoTokenizer(pretrained_model_name='bert-base-cased', vocab_file=tokenizer_path)

    # Test braceexpand loading
    tarpath = os.path.abspath(os.path.join(test_data_dir, 'asr/tarred_an4/audio_{0..1}.tar'))
    ds_braceexpand = TarredAudioToBPEDataset(
        audio_tar_filepaths=tarpath, manifest_filepath=manifest_path, tokenizer=tokenizer, sample_rate=16000
    )
    assert len(ds_braceexpand) == 32
    count = 0
    for _ in ds_braceexpand:
        count += 1
    assert count == 32

    # Test loading via list
    tarpath = [os.path.abspath(os.path.join(test_data_dir, f'asr/tarred_an4/audio_{i}.tar')) for i in range(2)]
    ds_list_load = TarredAudioToBPEDataset(
        audio_tar_filepaths=tarpath, manifest_filepath=manifest_path, tokenizer=tokenizer, sample_rate=16000
    )
    count = 0
    for _ in ds_list_load:
        count += 1
    assert count == 32
def _setup_tokenizer(self):
    if self.tokenizer_type not in ['bpe', 'wpe']:
        raise ValueError(
            "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or "
            "`wpe` for BERT based tokenizer"
        )

    if self.tokenizer_type == 'bpe':
        # This is a BPE Tokenizer
        model_path = os.path.join(self.tokenizer_dir, 'tokenizer.model')
        model_path = self.register_artifact('tokenizer.model_path', model_path)
        self.model_path = model_path

        if 'special_tokens' in self.tokenizer_cfg:
            special_tokens = self.tokenizer_cfg['special_tokens']
        else:
            special_tokens = None

        # Update special tokens
        self.tokenizer = tokenizers.SentencePieceTokenizer(model_path=model_path, special_tokens=special_tokens)

        vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt')
        vocab_path = self.register_artifact('tokenizer.vocab_path', vocab_path)
        self.vocab_path = vocab_path

        vocabulary = {0: '<unk>'}
        with open(vocab_path) as f:
            for i, piece in enumerate(f):
                piece = piece.replace('\n', '')
                vocabulary[i + 1] = piece

        # wrapper method to get vocabulary conveniently
        def get_vocab():
            return vocabulary

        # attach utility values to the tokenizer wrapper
        self.tokenizer.tokenizer.vocab_size = len(vocabulary)
        self.tokenizer.tokenizer.get_vocab = get_vocab
        self.tokenizer.tokenizer.all_special_tokens = self.tokenizer.special_token_to_id

    else:
        # This is a WPE Tokenizer
        vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt')
        self.tokenizer_dir = self.register_artifact('tokenizer.vocab_path', vocab_path)
        self.vocab_path = self.tokenizer_dir

        self.tokenizer = tokenizers.AutoTokenizer(
            pretrained_model_name='bert-base-cased', vocab_file=self.tokenizer_dir, **self.tokenizer_cfg
        )

    logging.info(
        "Tokenizer {} initialized with {} tokens".format(self.tokenizer.__class__.__name__, self.tokenizer.vocab_size)
    )
def _setup_tokenizer(self, tokenizer_cfg: DictConfig):
    # Prevent tokenizer parallelism (unless user has explicitly set it)
    if 'TOKENIZERS_PARALLELISM' not in os.environ:
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    self.tokenizer_cfg = OmegaConf.to_container(tokenizer_cfg, resolve=True)  # type: dict
    self.tokenizer_dir = self.tokenizer_cfg.pop('dir')  # Remove tokenizer directory
    self.tokenizer_type = self.tokenizer_cfg.pop('type').lower()  # Remove tokenizer_type

    if self.tokenizer_type not in ['bpe', 'wpe']:
        raise ValueError(
            "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or "
            "`wpe` for BERT based tokenizer"
        )

    if self.tokenizer_type == 'bpe':
        # This is a BPE Tokenizer
        model_path = os.path.join(self.tokenizer_dir, 'tokenizer.model')
        model_path = self.register_artifact('tokenizer.model_path', model_path)
        self.model_path = model_path

        if 'special_tokens' in self.tokenizer_cfg:
            special_tokens = self.tokenizer_cfg['special_tokens']
        else:
            special_tokens = None

        # Update special tokens
        self.tokenizer = tokenizers.SentencePieceTokenizer(model_path=model_path, special_tokens=special_tokens)

        vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt')
        vocab_path = self.register_artifact('tokenizer.vocab_path', vocab_path)
        self.vocab_path = vocab_path

        try:
            spe_vocab_path = os.path.join(self.tokenizer_dir, 'tokenizer.vocab')
            spe_vocab_path = self.register_artifact('spe_tokenizer.vocab', spe_vocab_path)
            self.spe_vocab_path = spe_vocab_path
        except FileNotFoundError:
            # fallback case for older checkpoints that did not preserve the tokenizer.vocab
            self.spe_vocab_path = None

        vocabulary = {'<unk>': 0}
        with open(vocab_path) as f:
            for i, piece in enumerate(f):
                piece = piece.replace('\n', '')
                vocabulary[piece] = i + 1

        # wrapper method to get vocabulary conveniently
        def get_vocab():
            return vocabulary

        # attach utility values to the tokenizer wrapper
        self.tokenizer.tokenizer.vocab_size = len(vocabulary)
        self.tokenizer.tokenizer.get_vocab = get_vocab
        self.tokenizer.tokenizer.all_special_tokens = self.tokenizer.special_token_to_id

    else:
        # This is a WPE Tokenizer
        vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt')
        self.tokenizer_dir = self.register_artifact('tokenizer.vocab_path', vocab_path)
        self.vocab_path = self.tokenizer_dir

        self.tokenizer = tokenizers.AutoTokenizer(
            pretrained_model_name='bert-base-cased', vocab_file=self.tokenizer_dir, **self.tokenizer_cfg
        )

    logging.info(
        "Tokenizer {} initialized with {} tokens".format(self.tokenizer.__class__.__name__, self.tokenizer.vocab_size)
    )
def test_dali_bpe_dataset(self, test_data_dir):
    manifest_path = os.path.abspath(os.path.join(test_data_dir, 'asr/an4_val.json'))

    num_samples = 10
    batch_size = 2
    device = 'gpu' if torch.cuda.is_available() else 'cpu'
    texts = []

    tokenizer_path = os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128", 'vocab.txt')
    tokenizer = tokenizers.AutoTokenizer(pretrained_model_name='bert-base-cased', vocab_file=tokenizer_path)

    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as f:
        with open(manifest_path, 'r') as m:
            for ix, line in enumerate(m):
                if ix >= num_samples:
                    break

                line = line.replace("tests/data/", "tests/.data/").replace("\n", "")
                f.write(f"{line}\n")

                data = json.loads(line)
                texts.append(data['text'])

        f.seek(0)

        dataset = AudioToBPEDALIDataset(
            manifest_filepath=f.name,
            tokenizer=tokenizer,
            device=device,
            batch_size=batch_size,
            max_duration=16.0,
            shuffle=False,
        )

        assert len(dataset) == (num_samples // batch_size)  # num batches
        count = 0
        original_transcripts = []
        for batch in dataset:
            transcripts = batch[2]  # transcript index in DALIOutputs
            transcripts_lengths = batch[3]  # transcript length index in DALIOutputs
            transcripts = [
                decode_subwords(transcript, transcripts_length, tokenizer=tokenizer)
                for transcript, transcripts_length in zip(transcripts, transcripts_lengths)
            ]
            original_transcripts.extend(transcripts)
            count += len(transcripts)
        assert count == num_samples

        # Assert transcripts are correct
        for text, og_transcript in zip(texts, original_transcripts):
            assert text == og_transcript

        # Repeat, now with shuffle enabled
        f.seek(0)

        dataset = AudioToBPEDALIDataset(
            manifest_filepath=f.name,
            tokenizer=tokenizer,
            device=device,
            batch_size=batch_size,
            max_duration=16.0,
            shuffle=True,
        )

        assert len(dataset) == (num_samples // batch_size)  # num batches
        count = 0
        shuffled_transcripts = []
        for batch in dataset:
            transcripts = batch[2]  # transcript index in DALIOutputs
            transcripts_lengths = batch[3]  # transcript length index in DALIOutputs
            transcripts = [
                decode_subwords(transcript, transcripts_length, tokenizer=tokenizer)
                for transcript, transcripts_length in zip(transcripts, transcripts_lengths)
            ]
            shuffled_transcripts.extend(transcripts)
            count += len(transcripts)
        assert count == num_samples

        samples_changed = 0
        for orig, shuffled in zip(original_transcripts, shuffled_transcripts):
            if orig != shuffled:
                samples_changed += 1
        assert samples_changed > 1  # assume after shuffling at least 1 sample was displaced
def _setup_tokenizer(self, tokenizer_cfg: DictConfig):
    # Prevent tokenizer parallelism (unless user has explicitly set it)
    if 'TOKENIZERS_PARALLELISM' not in os.environ:
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    self.tokenizer_cfg = OmegaConf.to_container(tokenizer_cfg, resolve=True)  # type: dict
    self.tokenizer_dir = self.tokenizer_cfg.pop('dir')  # Remove tokenizer directory
    self.tokenizer_type = self.tokenizer_cfg.pop('type').lower()  # Remove tokenizer_type
    self.hf_tokenizer_kwargs = self.tokenizer_cfg.pop("hf_kwargs", {})  # Remove HF tokenizer kwargs

    # Preserve config
    if hasattr(self, 'cfg') and 'tokenizer' in self.cfg:
        self.cfg.tokenizer.dir = self.tokenizer_dir
        self.cfg.tokenizer.type = self.tokenizer_type

        if 'hf_kwargs' in tokenizer_cfg:
            with open_dict(self.cfg.tokenizer):
                self.cfg.tokenizer.hf_kwargs = tokenizer_cfg.get('hf_kwargs')

    if self.tokenizer_type not in ['bpe', 'wpe']:
        raise ValueError(
            "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or "
            "`wpe` for BERT based tokenizer"
        )

    if self.tokenizer_type == 'bpe':
        # This is a BPE Tokenizer
        if 'model_path' in self.tokenizer_cfg:
            model_path = self.tokenizer_cfg.get('model_path')
        else:
            model_path = os.path.join(self.tokenizer_dir, 'tokenizer.model')
        model_path = self.register_artifact('tokenizer.model_path', model_path)
        self.model_path = model_path

        if 'special_tokens' in self.tokenizer_cfg:
            special_tokens = self.tokenizer_cfg['special_tokens']

            if special_tokens is not None:
                raise ValueError("`special_tokens` are no longer supported for SentencePiece based tokenizers.")

        # Update special tokens
        self.tokenizer = tokenizers.SentencePieceTokenizer(model_path=model_path)

        if 'vocab_path' in self.tokenizer_cfg:
            vocab_path = self.tokenizer_cfg.get('vocab_path')
        else:
            vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt')
        vocab_path = self.register_artifact('tokenizer.vocab_path', vocab_path)
        self.vocab_path = vocab_path

        try:
            if 'spe_tokenizer_vocab' in self.tokenizer_cfg:
                spe_vocab_path = self.tokenizer_cfg.get('spe_tokenizer_vocab')
            else:
                spe_vocab_path = os.path.join(self.tokenizer_dir, 'tokenizer.vocab')
            spe_vocab_path = self.register_artifact('tokenizer.spe_tokenizer_vocab', spe_vocab_path)
            self.spe_vocab_path = spe_vocab_path
        except FileNotFoundError:
            # fallback case for older checkpoints that did not preserve the tokenizer.vocab
            self.spe_vocab_path = None

        vocabulary = {}
        for i in range(self.tokenizer.vocab_size):
            piece = self.tokenizer.ids_to_tokens([i])
            piece = piece[0]
            vocabulary[piece] = i + 1

        # wrapper method to get vocabulary conveniently
        def get_vocab():
            return vocabulary

        # attach utility values to the tokenizer wrapper
        self.tokenizer.tokenizer.vocab_size = len(vocabulary)
        self.tokenizer.tokenizer.get_vocab = get_vocab
        self.tokenizer.tokenizer.all_special_tokens = self.tokenizer.special_token_to_id

    else:
        # This is a WPE Tokenizer
        # If path from previous registration exists, remove it
        if 'vocab_path' in self.tokenizer_cfg:
            vocab_path = self.tokenizer_cfg.get('vocab_path')
        else:
            vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt')
        vocab_path = self.register_artifact('tokenizer.vocab_path', vocab_path)
        self.vocab_path = vocab_path

        # If path from previous registration exists, remove it
        if 'vocab_path' in self.tokenizer_cfg:
            self.tokenizer_cfg.pop('vocab_path')

        self.tokenizer = tokenizers.AutoTokenizer(
            pretrained_model_name='bert-base-cased',
            vocab_file=self.vocab_path,
            mask_token=self.hf_tokenizer_kwargs.get('mask_token', None),
            bos_token=self.hf_tokenizer_kwargs.get('bos_token', None),
            eos_token=self.hf_tokenizer_kwargs.get('eos_token', None),
            pad_token=self.hf_tokenizer_kwargs.get('pad_token', None),
            sep_token=self.hf_tokenizer_kwargs.get('sep_token', None),
            cls_token=self.hf_tokenizer_kwargs.get('cls_token', None),
            unk_token=self.hf_tokenizer_kwargs.get('unk_token', None),
            use_fast=self.hf_tokenizer_kwargs.get('use_fast', False),
        )

    logging.info(
        "Tokenizer {} initialized with {} tokens".format(self.tokenizer.__class__.__name__, self.tokenizer.vocab_size)
    )
def _make_tokenizer(self, tokenizer_cfg: DictConfig, lang=None):
    tokenizer_type = tokenizer_cfg.get('type').lower()
    tokenizer_dir = tokenizer_cfg.get('dir')

    if tokenizer_type not in ['bpe', 'wpe']:
        raise ValueError(
            '`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or '
            '`wpe` for BERT based tokenizer'
        )

    # defaults
    model_path = None
    vocab_path = None
    spe_vocab_path = None

    if tokenizer_type == 'bpe':
        # This is a BPE Tokenizer
        if 'model_path' in tokenizer_cfg:
            model_path = tokenizer_cfg.get('model_path')
        else:
            model_path = os.path.join(tokenizer_dir, 'tokenizer.model')
        model_path = self.register_artifact(
            'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.model_path', model_path
        )

        if 'special_tokens' in tokenizer_cfg:
            special_tokens = tokenizer_cfg['special_tokens']
            if special_tokens is not None:
                raise ValueError('`special_tokens` are no longer supported for SentencePiece based tokenizers.')

        # Update special tokens
        tokenizer = tokenizers.SentencePieceTokenizer(model_path=model_path)

        if 'vocab_path' in tokenizer_cfg:
            vocab_path = tokenizer_cfg.get('vocab_path')
        else:
            vocab_path = os.path.join(tokenizer_dir, 'vocab.txt')
        vocab_path = self.register_artifact(
            'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.vocab_path', vocab_path
        )

        try:
            if 'spe_tokenizer_vocab' in tokenizer_cfg:
                spe_vocab_path = tokenizer_cfg.get('spe_tokenizer_vocab')
            else:
                spe_vocab_path = os.path.join(tokenizer_dir, 'tokenizer.vocab')
            spe_vocab_path = self.register_artifact(
                'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.spe_tokenizer_vocab',
                spe_vocab_path,
            )
        except FileNotFoundError:
            # fallback case for older checkpoints that did not preserve the tokenizer.vocab
            spe_vocab_path = None

        vocabulary = {}
        for i in range(tokenizer.vocab_size):
            piece = tokenizer.ids_to_tokens([i])
            piece = piece[0]
            vocabulary[piece] = i + 1

        # wrapper method to get vocabulary conveniently
        def get_vocab():
            return vocabulary

        # attach utility values to the tokenizer wrapper
        tokenizer.tokenizer.vocab_size = len(vocabulary)
        tokenizer.tokenizer.get_vocab = get_vocab
        tokenizer.tokenizer.all_special_tokens = tokenizer.special_token_to_id

    else:
        # This is a WPE Tokenizer
        # If path from previous registration exists, remove it
        if 'vocab_path' in tokenizer_cfg:
            vocab_path = tokenizer_cfg.get('vocab_path')
        else:
            vocab_path = os.path.join(tokenizer_dir, 'vocab.txt')
        vocab_path = self.register_artifact(
            'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.vocab_path', vocab_path
        )

        # If path from previous registration exists, remove it
        if 'vocab_path' in tokenizer_cfg:
            tokenizer_cfg.pop('vocab_path')

        hf_tokenizer_kwargs = tokenizer_cfg.get('hf_kwargs', {})
        tokenizer = tokenizers.AutoTokenizer(
            pretrained_model_name='bert-base-cased',
            vocab_file=vocab_path,
            mask_token=hf_tokenizer_kwargs.get('mask_token', None),
            bos_token=hf_tokenizer_kwargs.get('bos_token', None),
            eos_token=hf_tokenizer_kwargs.get('eos_token', None),
            pad_token=hf_tokenizer_kwargs.get('pad_token', None),
            sep_token=hf_tokenizer_kwargs.get('sep_token', None),
            cls_token=hf_tokenizer_kwargs.get('cls_token', None),
            unk_token=hf_tokenizer_kwargs.get('unk_token', None),
            use_fast=hf_tokenizer_kwargs.get('use_fast', False),
        )

    logging.info(
        'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, tokenizer.vocab_size)
    )

    return tokenizer, model_path, vocab_path, spe_vocab_path
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Create token LM for input manifest and tokenizer.""",
    )
    parser.add_argument(
        "--manifest", required=True, type=str, help="Comma separated list of manifest files",
    )
    parser.add_argument(
        "--tokenizer_dir",
        required=True,
        type=str,
        help="The directory path to the tokenizer vocabulary + additional metadata",
    )
    parser.add_argument(
        "--tokenizer_type",
        required=True,
        type=str,
        choices=["bpe", "wpe"],
        help="The type of the tokenizer. Currently supports `bpe` and `wpe`",
    )
    parser.add_argument(
        "--lm_builder",
        default="chain-est-phone-lm",
        type=str,
        help=(
            "The path or name of an LM builder. Supported builders: chain-est-phone-lm "
            "and scripts/asr_language_modeling/ngram_lm/make_phone_lm.py"
        ),
    )
    parser.add_argument(
        "--ngram_order", type=int, default=2, choices=[2, 3, 4, 5], help="Order of n-gram to use",
    )
    parser.add_argument(
        "--output_file", required=True, type=str, help="The path to store the token LM",
    )
    parser.add_argument(
        "--do_lowercase", action="store_true", help="Whether to apply lower case conversion on the text",
    )
    args = parser.parse_args()

    is_chain_builder = Path(args.lm_builder).stem == "chain-est-phone-lm"

    """ TOKENIZER SETUP """
    logging.info(f"Loading {args.tokenizer_type} tokenizer from '{args.tokenizer_dir}' ...")

    if args.tokenizer_type == "bpe":
        # This is a BPE Tokenizer
        model_path = os.path.join(args.tokenizer_dir, "tokenizer.model")

        # Update special tokens
        tokenizer = tokenizers.SentencePieceTokenizer(model_path=model_path)
    else:
        # This is a WPE Tokenizer
        vocab_path = os.path.join(args.tokenizer_dir, "vocab.txt")
        tokenizer = tokenizers.AutoTokenizer(pretrained_model_name="bert-base-cased", vocab_file=vocab_path)

    logging.info(f"Tokenizer {tokenizer.__class__.__name__} loaded with {tokenizer.vocab_size} tokens")

    """ DATA PROCESSING """
    if "," in args.manifest:
        manifests = args.manifest.split(",")
    else:
        manifests = [args.manifest]

    offset = 1  # tokens in token LM cannot be 0
    tok_text_list = []
    num_lines = 0
    for manifest in manifests:
        logging.info(f"Processing manifest : {manifest} ...")
        with open(manifest, "r") as in_reader:
            for line in in_reader:
                item = json.loads(line)
                text = item["text"]
                if args.do_lowercase:
                    text = text.lower()
                tok_text = " ".join([str(i + offset) for i in tokenizer.text_to_ids(text)])
                if is_chain_builder:
                    tok_text = f"line_{num_lines} " + tok_text
                tok_text_list.append(tok_text)
                num_lines += 1

    tok_texts = "\n".join(tok_text_list)
    del tok_text_list
    logging.info("Finished processing all manifests ! Number of sentences : {}".format(num_lines))

    """ LM BUILDING """
    logging.info(f"Calling {args.lm_builder} ...")
    if is_chain_builder:
        pipe_args = [
            args.lm_builder,
            f"--ngram-order={args.ngram_order}",
            f"--no-prune-ngram-order={args.ngram_order}",
            "ark:-",
            "-",
        ]
        p1 = Popen(pipe_args, stdin=PIPE, stdout=PIPE, text=True)
        p2 = Popen(["fstprint"], stdin=p1.stdout, stdout=PIPE, text=True)
        p1.stdout.close()
        p1.stdout = None
        Thread(target=p1.communicate, args=[tok_texts]).start()
        out, err = p2.communicate()
    else:
        pipe_args = [
            args.lm_builder,
            f"--ngram-order={args.ngram_order}",
            f"--no-backoff-ngram-order={args.ngram_order}",
            "--phone-disambig-symbol=-11",
        ]
        p1 = Popen(pipe_args, stdout=PIPE, stdin=PIPE, text=True)
        out, err = p1.communicate(tok_texts)

    logging.info(f"LM is built, writing to {args.output_file} ...")
    with open(args.output_file, "w", encoding="utf-8") as f:
        f.write(out)
    logging.info(f"Done writing to '{args.output_file}'.")