def __init__(
    self,
    manifest_filepath,
    labels,
    featurizer,
    max_duration=None,
    min_duration=None,
    max_utts=0,
    blank_index=-1,
    unk_index=-1,
    normalize=True,
    trim=False,
    bos_id=None,
    eos_id=None,
    load_audio=True,
    parser='en',
    add_misc=False,
):
    self.collection = collections.ASRAudioText(
        manifests_files=manifest_filepath.split(','),
        parser=parsers.make_parser(
            labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize,
        ),
        min_duration=min_duration,
        max_duration=max_duration,
        max_number=max_utts,
    )

    self.featurizer = featurizer
    self.trim = trim
    self.eos_id = eos_id
    self.bos_id = bos_id
    self.load_audio = load_audio
    self._add_misc = add_misc
def __init__(self, manifest_path: str, tar_filepaths: Union[str, List[str]], shuffle_n: int = 128):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True)

    if isinstance(tar_filepaths, str):
        # Replace '(', '[', '<' and '_OP_' with '{'
        brace_keys_open = ['(', '[', '<', '_OP_']
        for bkey in brace_keys_open:
            if bkey in tar_filepaths:
                tar_filepaths = tar_filepaths.replace(bkey, "{")

        # Replace ')', ']', '>' and '_CL_' with '}'
        brace_keys_close = [')', ']', '>', '_CL_']
        for bkey in brace_keys_close:
            if bkey in tar_filepaths:
                tar_filepaths = tar_filepaths.replace(bkey, "}")

    self.audio_dataset = (
        wd.Dataset(tar_filepaths).shuffle(shuffle_n).rename(audio='wav', key='__key__').to_tuple('audio', 'key')
    )
    self.audio_iter = iter(self.audio_dataset)
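# The bracket-to-brace rewriting above turns shard specs written with '(', '[', '<' or the
# '_OP_'/'_CL_' markers into standard brace-expansion patterns, which `braceexpand` (used
# elsewhere in this file) and WebDataset's URL handling understand. A minimal sketch of the
# idea; the spec string below is hypothetical, not part of the dataset code:
import braceexpand

spec = "audio__OP_0..3_CL_.tar"
for bkey in ['(', '[', '<', '_OP_']:
    spec = spec.replace(bkey, "{")
for bkey in [')', ']', '>', '_CL_']:
    spec = spec.replace(bkey, "}")

print(spec)                                 # audio_{0..3}.tar
print(list(braceexpand.braceexpand(spec)))  # ['audio_0.tar', 'audio_1.tar', 'audio_2.tar', 'audio_3.tar']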
def __init__(
    self,
    manifest_path=None,
    min_snr_db=10,
    max_snr_db=50,
    max_gain_db=300.0,
    rng=None,
    audio_tar_filepaths=None,
    shuffle_n=100,
    orig_sr=16000,
):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True)
    self._audiodataset = None
    self._tarred_audio = False
    self._orig_sr = orig_sr
    self._data_iterator = None

    if audio_tar_filepaths:
        self._tarred_audio = True
        self._audiodataset = AugmentationDataset(manifest_path, audio_tar_filepaths, shuffle_n)
        self._data_iterator = iter(self._audiodataset)

    self._rng = random.Random() if rng is None else rng
    self._min_snr_db = min_snr_db
    self._max_snr_db = max_snr_db
    self._max_gain_db = max_gain_db
def __init__(
    self,
    manifest_filepath: str,
    parser: Union[str, Callable],
    sample_rate: int,
    int_values: bool = False,
    augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
    max_duration: Optional[int] = None,
    min_duration: Optional[int] = None,
    max_utts: int = 0,
    trim: bool = False,
    bos_id: Optional[int] = None,
    eos_id: Optional[int] = None,
    pad_id: int = 0,
):
    self.parser = parser

    self.collection = collections.ASRAudioText(
        manifests_files=manifest_filepath.split(','),
        parser=parser,
        min_duration=min_duration,
        max_duration=max_duration,
        max_number=max_utts,
    )

    self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
    self.trim = trim
    self.eos_id = eos_id
    self.bos_id = bos_id
    self.pad_id = pad_id
def __init__(self, manifest_path: str, tar_filepaths: Union[str, List[str]], shuffle_n: int = 128):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True)

    if isinstance(tar_filepaths, str):
        # Replace '(', '[', '<' and '_OP_' with '{'
        brace_keys_open = ['(', '[', '<', '_OP_']
        for bkey in brace_keys_open:
            if bkey in tar_filepaths:
                tar_filepaths = tar_filepaths.replace(bkey, "{")

        # Replace ')', ']', '>' and '_CL_' with '}'
        brace_keys_close = [')', ']', '>', '_CL_']
        for bkey in brace_keys_close:
            if bkey in tar_filepaths:
                tar_filepaths = tar_filepaths.replace(bkey, "}")

    self.audio_dataset = wd.WebDataset(urls=tar_filepaths, nodesplitter=None)

    if shuffle_n > 0:
        self.audio_dataset = self.audio_dataset.shuffle(shuffle_n)
    else:
        logging.info("WebDataset will not shuffle files within the tar files.")

    self.audio_dataset = self.audio_dataset.rename(audio='wav', key='__key__').to_tuple('audio', 'key')
    self.audio_iter = iter(self.audio_dataset)
def __init__(
    self,
    manifest_filepath: Union[str, 'pathlib.Path'],
    n_segments: int,
    max_duration: Optional[float] = None,
    min_duration: Optional[float] = None,
    trim: Optional[bool] = False,
    truncate_to: Optional[int] = 1,
):
    """
    See the above AudioDataset for details on dataset and manifest formats.

    Unlike the regular AudioDataset, which samples random segments from each audio array as an example,
    SplicedAudioDataset concatenates all audio arrays together and indexes segments as examples. This way,
    the model sees more data (about 9x for LJSpeech) per epoch.

    Note: this class is not recommended for use in validation.

    Args:
        manifest_filepath (str, Path): Path to manifest json as described above. Can be comma-separated paths
            such as "train_1.json,train_2.json", which is treated as two separate json files.
        n_segments (int): The length of audio in samples to load. For example, given a sample rate of 16kHz
            and n_segments=16000, a random 1-second section of audio from the clip will be loaded. The
            section will be randomly sampled every time the audio is batched. Can be set to -1 to load the
            entire audio.
        max_duration (float): If audio exceeds this length in seconds, it is filtered from the dataset.
            Defaults to None, which does not filter any audio.
        min_duration (float): If audio is less than this length in seconds, it is filtered from the dataset.
            Defaults to None, which does not filter any audio.
        trim (bool): Whether to use librosa.effects.trim on the audio clip.
        truncate_to (int): Ensures that the audio segment returned is a multiple of truncate_to.
            Defaults to 1, which does no truncating.
    """
    assert n_segments > 0

    collection = collections.ASRAudioText(
        manifests_files=manifest_filepath.split(','),
        parser=parsers.make_parser(),
        min_duration=min_duration,
        max_duration=max_duration,
    )
    self.trim = trim
    self.n_segments = n_segments
    self.truncate_to = truncate_to

    self.samples = []
    for index in range(len(collection)):
        example = collection[index]
        with sf.SoundFile(example.audio_file, 'r') as f:
            samples = f.read(dtype='float32').transpose()
        self.samples.append(samples)
    self.samples = np.concatenate(self.samples, axis=0)
    self.samples = self.samples[: self.samples.shape[0] - (self.samples.shape[0] % self.n_segments), ...]
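# The final slice above trims the concatenated waveform so its length is an exact multiple of
# n_segments, ensuring every index maps onto a complete segment. A toy illustration of that
# arithmetic (the array here is illustrative, not real audio):
import numpy as np

n_segments = 4
samples = np.arange(10, dtype='float32')  # 10 samples do not divide evenly into segments of 4

samples = samples[: samples.shape[0] - (samples.shape[0] % n_segments)]
print(samples.shape[0])                   # 8 -- the trailing remainder of 2 samples is dropped
print(samples.reshape(-1, n_segments))    # two complete segments: [[0. 1. 2. 3.], [4. 5. 6. 7.]]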
def __init__(self, manifest_path=None, rng=None, audio_tar_filepaths=None, shuffle_n=128, shift_impulse=False):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True)
    self._audiodataset = None
    self._tarred_audio = False
    self._shift_impulse = shift_impulse
    self._data_iterator = None

    if audio_tar_filepaths:
        self._tarred_audio = True
        self._audiodataset = AugmentationDataset(manifest_path, audio_tar_filepaths, shuffle_n)
        self._data_iterator = iter(self._audiodataset)

    self._rng = random.Random() if rng is None else rng
def __init__(
    self,
    manifest_path=None,
    min_snr_db=40,
    max_snr_db=50,
    max_gain_db=300.0,
    rng=None,
):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]))
    self._rng = random.Random() if rng is None else rng
    self._min_snr_db = min_snr_db
    self._max_snr_db = max_snr_db
    self._max_gain_db = max_gain_db
def __init__(
    self,
    manifest_filepath,
    n_segments=0,
    max_duration=None,
    min_duration=None,
    trim=False,
):
    """See AudioDataLayer"""
    self.collection = collections.ASRAudioText(
        manifests_files=manifest_filepath.split(','),
        parser=parsers.make_parser(),
        min_duration=min_duration,
        max_duration=max_duration,
    )
    self.trim = trim
    self.n_segments = n_segments
def __init__( self, manifest_filepath: Union[str, "pathlib.Path"], n_segments: int, max_duration: Optional[float] = None, min_duration: Optional[float] = None, trim: Optional[bool] = False, truncate_to: Optional[int] = 1, ): """ Mostly compliant with nemo.collections.asr.data.datalayers.AudioToTextDataset except it only returns Audio without text. Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). Each new line is a different sample. Note that text is required, but is ignored for AudioDataset. Example below: {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} ... {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": "utterance_id", "ctm_utt": "en_4156", "side": "A"} Args: manifest_filepath (str, Path): Path to manifest json as described above. Can be comma-separated paths such as "train_1.json,train_2.json" which is treated as two separate json files. n_segments (int): The length of audio in samples to load. For example, given a sample rate of 16kHz, and n_segments=16000, a random 1 second section of audio from the clip will be loaded. The section will be randomly sampled everytime the audio is batched. Can be set to -1 to load the entire audio. max_duration (float): If audio exceeds this length in seconds, it is filtered from the dataset. Defaults to None, which does not filter any audio. min_duration(float): If audio is less than this length in seconds, it is filtered from the dataset. Defaults to None, which does not filter any audio. trim (bool): Whether to use librosa.effects.trim on the audio clip truncate_to (int): Ensures that the audio segment returned is a multiple of truncate_to. Defaults to 1, which does no truncating. """ self.collection = collections.ASRAudioText( manifests_files=manifest_filepath.split(","), parser=parsers.make_parser(), min_duration=min_duration, max_duration=max_duration, ) self.trim = trim self.n_segments = n_segments self.truncate_to = truncate_to
def __init__(
    self,
    audio_tar_filepaths: Union[str, List[str]],
    manifest_filepath: str,
    parser: Callable,
    sample_rate: int,
    int_values: bool = False,
    augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None,
    shuffle_n: int = 0,
    min_duration: Optional[float] = None,
    max_duration: Optional[float] = None,
    max_utts: int = 0,
    trim: bool = False,
    bos_id: Optional[int] = None,
    eos_id: Optional[int] = None,
    add_misc: bool = False,
    pad_id: int = 0,
    shard_strategy: str = "scatter",
    global_rank: int = 0,
    world_size: int = 0,
):
    self.collection = collections.ASRAudioText(
        manifests_files=manifest_filepath.split(','),
        parser=parser,
        min_duration=min_duration,
        max_duration=max_duration,
        max_number=max_utts,
        index_by_file_id=True,  # Must set this so the manifest lines can be indexed by file ID
    )

    self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
    self.trim = trim
    self.eos_id = eos_id
    self.bos_id = bos_id
    self.pad_id = pad_id
    self._add_misc = add_misc

    valid_shard_strategies = ['scatter', 'replicate']
    if shard_strategy not in valid_shard_strategies:
        raise ValueError(f"`shard_strategy` must be one of {valid_shard_strategies}")

    if isinstance(audio_tar_filepaths, str):
        # Replace '(', '[', '<' and '_OP_' with '{'
        brace_keys_open = ['(', '[', '<', '_OP_']
        for bkey in brace_keys_open:
            if bkey in audio_tar_filepaths:
                audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "{")

        # Replace ')', ']', '>' and '_CL_' with '}'
        brace_keys_close = [')', ']', '>', '_CL_']
        for bkey in brace_keys_close:
            if bkey in audio_tar_filepaths:
                audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "}")

    # Check for distributed training and partition shards accordingly
    if world_size > 1:
        if isinstance(audio_tar_filepaths, str):
            # Brace expand
            audio_tar_filepaths = list(braceexpand.braceexpand(audio_tar_filepaths))

        if shard_strategy == 'scatter':
            logging.info("All tarred dataset shards will be scattered evenly across all nodes.")

            if len(audio_tar_filepaths) % world_size != 0:
                logging.warning(
                    f"Number of shards in tarred dataset ({len(audio_tar_filepaths)}) is not divisible "
                    f"by number of distributed workers ({world_size})."
                )

            begin_idx = (len(audio_tar_filepaths) // world_size) * global_rank
            end_idx = begin_idx + (len(audio_tar_filepaths) // world_size)
            audio_tar_filepaths = audio_tar_filepaths[begin_idx:end_idx]
            logging.info(
                "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx
            )

        elif shard_strategy == 'replicate':
            logging.info("All tarred dataset shards will be replicated across all nodes.")

        else:
            raise ValueError(f"Invalid shard strategy! Allowed values are: {valid_shard_strategies}")

    # Put together WebDataset pipeline
    self._dataset = (
        wd.Dataset(audio_tar_filepaths)
        .shuffle(shuffle_n)
        .rename(audio='wav', key='__key__')
        .to_tuple('audio', 'key')
        .pipe(self._filter)
        .map(f=self._build_sample)
    )
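# The 'scatter' strategy above is a contiguous partition: each rank takes
# len(shards) // world_size shards starting at its own offset, and when the shard count is not
# divisible by world_size the trailing remainder is dropped -- hence the warning logged above.
# A toy illustration of the partition arithmetic, with hypothetical shard names:
shards = [f"audio_{i}.tar" for i in range(8)]
world_size = 4

for global_rank in range(world_size):
    begin_idx = (len(shards) // world_size) * global_rank
    end_idx = begin_idx + (len(shards) // world_size)
    print(global_rank, shards[begin_idx:end_idx])
# 0 ['audio_0.tar', 'audio_1.tar']
# 1 ['audio_2.tar', 'audio_3.tar']
# 2 ['audio_4.tar', 'audio_5.tar']
# 3 ['audio_6.tar', 'audio_7.tar']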
def __init__(self, manifest_path=None, rng=None):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]))
    self._rng = random.Random() if rng is None else rng
def test_transcript_normalizers(self):
    # Create test json
    test_strings = [
        "TEST CAPITALIZATION",
        '!\\"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~',
        "3+3=10",
        "3 + 3 = 10",
        "why is \\t whitepsace\\tsuch a problem why indeed",
        "\\\"Can you handle quotes?,\\\" says the boy",
        "I Jump!!!!With joy?Now.",
        "Maybe I want to learn periods.",
        "$10 10.90 1-800-000-0000",
        "18000000000 one thousand 2020",
        "1 10 100 1000 10000 100000 1000000",
        "Î ĻƠvɆȩȅĘ ÀÁÃ Ą ÇĊňńŤŧș",
        "‘’“”❛❜❝❞「 」 〈 〉 《 》 【 】 〔 〕 ⦗ ⦘ 😙 👀 🔨",
        "It only costs $1 000 000! Cheap right?",
        "2500, 3000 are separate but 200, 125 is not",
        "1",
        "1 2",
        "1 2 3",
        "10:00pm is 10:00 pm is 22:00 but not 10: 00 pm",
        "10:00 10:01pm 10:10am 10:90pm",
        "Mr. Expand me!",
        "Mr Don't Expand me!",
    ]
    normalized_strings = [
        "test capitalization",
        'percent and \' plus',
        "three plus three ten",
        "three plus three ten",
        "why is whitepsace such a problem why indeed",
        "can you handle quotes says the boy",
        "i jump with joy now",
        "maybe i want to learn periods",
        "ten dollars ten point nine zero one eight hundred zero zero",
        "eighteen billion one thousand two thousand and twenty",
        # Two-line string below
        "one ten thousand one hundred one thousand ten thousand one "
        "hundred thousand one million",
        "i loveeee aaa a ccnntts",
        "''",
        "it only costs one million dollars cheap right",
        # Two-line string below
        "two thousand five hundred three thousand are separate but two "
        "hundred thousand one hundred and twenty five is not",
        "one",
        "one two",
        "one two three",
        "ten pm is ten pm is twenty two but not ten zero pm",
        "ten ten one pm ten ten am ten ninety pm",
        "mister expand me",
        "mr don't expand me",
    ]
    manifest_paths = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/asr/manifest_test.json"))

    def remove_test_json():
        os.remove(manifest_paths)

    self.addCleanup(remove_test_json)

    with open(manifest_paths, "w") as f:
        for s in test_strings:
            f.write('{"audio_filepath": "", "duration": 1.0, "text": ' f'"{s}"}}\n')

    parser = parsers.make_parser(self.labels, 'en')
    manifest = collections.ASRAudioText(manifests_files=[manifest_paths], parser=parser)
    for i, s in enumerate(normalized_strings):
        self.assertTrue(manifest[i].text_tokens == parser(s))