def __init__(self, params, model, num_workers=1, worker_id=0):
  super(TransformerDataLayer, self).__init__(params, model,
                                             num_workers, worker_id)
  self.src_vocab_file = self.params['src_vocab_file']
  # if tgt vocab isn't specified - assume common vocab file
  self.tgt_vocab_file = self.params.get('tgt_vocab_file',
                                        self.src_vocab_file)

  # load source and target vocabularies to RAM
  # pre-processed vocab starts from PAD, EOS
  self.src_seq2idx = load_pre_existing_vocabulary(
      self.src_vocab_file,
      min_idx=PAD_ID)
  self.tgt_seq2idx = load_pre_existing_vocabulary(
      self.tgt_vocab_file,
      min_idx=PAD_ID)

  self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
  self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

  self.params['src_vocab_size'] = len(self.src_seq2idx)
  self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
  self.params['target_seq2idx'] = self.tgt_seq2idx
  self.params['source_seq2idx'] = self.src_seq2idx
  self.params['target_idx2seq'] = self.tgt_idx2seq
  self.params['source_idx2seq'] = self.src_idx2seq

  self._num_workers = num_workers
  self._worker_id = worker_id

  self._input_tensors = {}
  self._iterator = None
  self.batched_dataset = None
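# Illustrative only: a minimal (partial) params dict for the constructor
# above. Key names come from the code; the path and batch size are assumed
# placeholders. Omitting 'tgt_vocab_file' exercises the shared-vocab
# fallback to 'src_vocab_file'.
_example_transformer_params = {
    'src_vocab_file': 'vocab.bpe.32000',  # assumed path
    # 'tgt_vocab_file' omitted: data layer falls back to src_vocab_file
    'batch_size': 256,
}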
def __init__(self, params, model, num_workers, worker_id):
  """Speech-to-text data layer constructor.

  See parent class for arguments description.

  Config parameters:

  * **num_audio_features** (int) --- number of audio features to extract.
  * **input_type** (str) --- could be either "spectrogram" or "mfcc".
  * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
  * **dataset_files** (list) --- list with paths to all dataset .csv files.
  * **augmentation** (dict) --- optional dictionary with data augmentation
    parameters. Can contain "time_stretch_ratio", "noise_level_min" and
    "noise_level_max" parameters, e.g.::

      {
        'time_stretch_ratio': 0.05,
        'noise_level_min': -90,
        'noise_level_max': -60,
      }

    For additional details on these parameters see
    :func:`data.speech2text.speech_utils.augment_audio_signal` function.
  * **autoregressive** (bool) --- boolean indicating whether the model is
    autoregressive.
  """
  super(Speech2TextDataLayer, self).__init__(params, model,
                                             num_workers, worker_id)

  self.params['autoregressive'] = self.params.get('autoregressive', False)
  self.autoregressive = self.params['autoregressive']
  self.params['bpe'] = self.params.get('bpe', False)
  # default target padding value; the autoregressive branch below
  # overrides it with the end-of-sequence index
  self.target_pad_value = 0
  if self.params['bpe']:
    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(self.params['vocab_file'])
    self.params['tgt_vocab_size'] = len(self.sp) + 1
  else:
    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'],
        read_chars=True,
    )
    if not self.autoregressive:
      # add one for implied blank token
      self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
    else:
      num_chars_orig = len(self.params['char2idx'])
      self.params['tgt_vocab_size'] = num_chars_orig + 2
      self.start_index = num_chars_orig
      self.end_index = num_chars_orig + 1
      self.params['char2idx']['<S>'] = self.start_index
      self.params['char2idx']['</S>'] = self.end_index
      self.target_pad_value = self.end_index
    self.params['idx2char'] = {i: w for w, i in
                               self.params['char2idx'].items()}

  self._files = None
  if self.params["interactive"]:
    return
  for csv in params['dataset_files']:
    files = pd.read_csv(csv, encoding='utf-8')
    if self._files is None:
      self._files = files
    else:
      self._files = self._files.append(files)

  if self.params['mode'] != 'infer':
    cols = ['wav_filename', 'transcript']
  else:
    cols = 'wav_filename'
  self.all_files = self._files.loc[:, cols].values
  self._files = self.split_data(self.all_files)

  self._size = self.get_size_in_samples()
  self._dataset = None
  self._iterator = None
  self._input_tensors = None

  self.params['max_duration'] = params.get('max_duration', None)
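# Illustrative only: a partial character-level (non-BPE, non-autoregressive)
# config for the data layer above. Paths are assumed placeholders. With
# 'bpe': False and 'autoregressive': False the constructor sets
# tgt_vocab_size = len(char2idx) + 1, reserving one index for the implied
# CTC blank token.
_example_s2t_params = {
    'num_audio_features': 64,
    'input_type': 'spectrogram',
    'vocab_file': 'vocab.txt',                       # assumed path
    'dataset_files': ['train.csv', 'dev.csv'],       # assumed paths
    'autoregressive': False,
    'bpe': False,
    'augmentation': {
        'time_stretch_ratio': 0.05,
        'noise_level_min': -90,
        'noise_level_max': -60,
    },
}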
def __init__(self, params, model, num_workers=None, worker_id=None):
  """Text-to-speech data layer constructor.

  See parent class for arguments description.

  Config parameters:

  * **dataset** (str) --- the dataset to use. 'LJ' for the LJSpeech 1.1
    dataset and 'MAILABS' are supported.
  * **num_audio_features** (int) --- number of audio features to extract.
  * **output_type** (str) --- could be either "magnitude", or "mel".
  * **vocab_file** (str) --- path to vocabulary file.
  * **dataset_files** (list) --- list with paths to all dataset .csv files.
    Each file is assumed to be separated by "|".
  * **dataset_location** (str) --- path to directory where wavs are stored.
  * **feature_normalize** (bool) --- whether to normalize the data with a
    preset mean and std.
  * **feature_normalize_mean** (float) --- used for feature normalization.
    Defaults to 0.
  * **feature_normalize_std** (float) --- used for feature normalization.
    Defaults to 1.
  * **mag_power** (int) --- the power to which the magnitude spectrogram is
    scaled: 1 for energy spectrogram, 2 for power spectrogram.
    Defaults to 2.
  * **pad_EOS** (bool) --- whether to apply EOS tokens to both the text and
    the speech signal. Will pad at least 1 token regardless of pad_to value.
    Defaults to True.
  * **pad_value** (float) --- the value we pad the spectrogram with.
    Defaults to np.log(data_min).
  * **pad_to** (int) --- we pad such that the resulting datapoint is a
    multiple of pad_to. Defaults to 8.
  * **trim** (bool) --- whether to trim silence via librosa or not.
    Defaults to False.
  * **data_min** (float) --- min clip value prior to taking the log.
    Defaults to 1e-5. Please change to 1e-2 if using htk mels.
  * **duration_min** (int) --- minimum duration in steps for speech signal.
    All signals shorter than this will be cut from the training set.
    Defaults to 0.
  * **duration_max** (int) --- maximum duration in steps for speech signal.
    All signals longer than this will be cut from the training set.
    Defaults to 4000.
  * **mel_type** (str) --- one of ['slaney', 'htk']. Decides which algorithm
    to use to compute mel specs. Defaults to 'htk'.
""" super(Text2SpeechDataLayer, self).__init__( params, model, num_workers, worker_id ) names = ['wav_filename', 'raw_transcript', 'transcript'] sep = '\x7c' header = None if self.params["dataset"] == "LJ": self._sampling_rate = 22050 self._n_fft = 1024 elif self.params["dataset"] == "MAILABS": self._sampling_rate = 16000 self._n_fft = 800 # Character level vocab self.params['char2idx'] = load_pre_existing_vocabulary( self.params['vocab_file'], min_idx=3, read_chars=True, ) # Add the pad, start, and end chars self.params['char2idx']['<p>'] = 0 self.params['char2idx']['<s>'] = 1 self.params['char2idx']['</s>'] = 2 self.params['idx2char'] = {i: w for w, i in self.params['char2idx'].items()} self.params['src_vocab_size'] = len(self.params['char2idx']) n_feats = self.params['num_audio_features'] if "both" in self.params["output_type"]: self._both = True if self.params["feature_normalize"]: raise ValueError( "feature normalize is not currently enabled for both mode" ) if not isinstance(n_feats, dict): raise ValueError( "num_audio_features must be a dictionary for both mode" ) else: if ("mel" not in n_feats and "magnitude" not in n_feats): raise ValueError( "num_audio_features must contain mel and magnitude keys" ) elif (not isinstance(n_feats["mel"], int) or not isinstance(n_feats["magnitude"], int)): raise ValueError( "num_audio_features must be a int" ) n_mels = n_feats['mel'] data_min = self.params.get("data_min", None) if data_min is not None: if not isinstance(data_min, dict): raise ValueError( "data_min must be a dictionary for both mode" ) else: if "mel" not in data_min and "magnitude" not in data_min: raise ValueError( "data_min must contain mel and magnitude keys" ) elif (not isinstance(data_min["mel"], float) or not isinstance(data_min["magnitude"], float)): raise ValueError( "data_min must be a float" ) self._exp_mag = self.params.get("exp_mag", True) else: if not isinstance(n_feats, int): raise ValueError( "num_audio_features must be a float for mel or magnitude mode" ) if not isinstance(self.params.get("data_min",1.0), float): raise ValueError( "data_min must be a float for mel or magnitude mode" ) self._both = False self._exp_mag = False n_mels = n_feats self._mel = "mel" in self.params["output_type"] if self._mel or self._both: htk = True norm = None if self.params.get('mel_type', 'htk') == 'slaney': htk = False norm = 1 self._mel_basis = librosa.filters.mel( sr=self._sampling_rate, n_fft=self._n_fft, n_mels=n_mels, htk=htk, norm=norm ) else: self._mel_basis = None if self.params["interactive"]: return # Load csv files self._files = None for csvs in params['dataset_files']: files = pd.read_csv( csvs, encoding='utf-8', sep=sep, header=header, names=names, quoting=3 ) if self._files is None: self._files = files else: self._files = self._files.append(files) if self.params['mode'] != 'infer': cols = ['wav_filename', 'transcript'] else: cols = 'transcript' all_files = self._files.loc[:, cols].values self._files = self.split_data(all_files) self._size = self.get_size_in_samples() self._dataset = None self._iterator = None self._input_tensors = None
def __init__(self, params, model, num_workers=1, worker_id=0):
  super(ParallelTextDataLayer, self).__init__(params, model,
                                              num_workers, worker_id)
  self._batch_size = self.params['batch_size']
  self.source_file = self.params['source_file']
  self._use_targets = self.params.get('use_targets', True)
  if not self._use_targets:
    self.target_file = self.source_file
    if 'target_file' in self.params:
      print("WARNING: target file was specified but was "
            "ignored by data layer because 'use_targets'=False")
  else:
    self.target_file = self.params['target_file']
  self.src_vocab_file = self.params['src_vocab_file']
  self.tgt_vocab_file = self.params['tgt_vocab_file']
  self.max_len = self.params['max_length']
  self._delimiter = self.params.get('delimiter', ' ')
  self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
  self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False)
  self._prefetch_buffer_size = self.params.get('prefetch_buffer_size',
                                               tf.contrib.data.AUTOTUNE)
  self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1)
  self._num_workers = num_workers
  self._worker_id = worker_id
  self._use_start_token = self.params.get('use_start_token', True)
  if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0):
    raise ValueError("If padding to 8 in data layer, then "
                     "max_length should be multiple of 8")

  def file_len(fname):
    # count lines in the file; i stays -1 for an empty file
    i = -1
    with open(fname) as f:
      for i, _ in enumerate(f):
        pass
    return i + 1

  self.dataset_size = file_len(self.source_file)

  special_tokens_already_in_vocab = self.params.get(
      'special_tokens_already_in_vocab', True)

  # load source and target vocabularies to RAM
  self.src_seq2idx = load_pre_existing_vocabulary(
      self.src_vocab_file,
      min_idx=0 if special_tokens_already_in_vocab
      else SpecialTextTokens.UNK_ID.value + 1)
  self.tgt_seq2idx = load_pre_existing_vocabulary(
      self.tgt_vocab_file,
      min_idx=0 if special_tokens_already_in_vocab
      else SpecialTextTokens.UNK_ID.value + 1)

  if not special_tokens_already_in_vocab:
    # manually add special tokens
    # unknown symbol
    self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.UNK_ID.value)] = \
      SpecialTextTokens.UNK_ID.value
    self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.UNK_ID.value)] = \
      SpecialTextTokens.UNK_ID.value
    # sentence start
    self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.S_ID.value)] = \
      SpecialTextTokens.S_ID.value
    self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.S_ID.value)] = \
      SpecialTextTokens.S_ID.value
    # sentence end
    self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.EOS_ID.value)] = \
      SpecialTextTokens.EOS_ID.value
    self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.EOS_ID.value)] = \
      SpecialTextTokens.EOS_ID.value
    # padding
    self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.PAD_ID.value)] = \
      SpecialTextTokens.PAD_ID.value
    self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.PAD_ID.value)] = \
      SpecialTextTokens.PAD_ID.value

  if self.params.get('pad_vocab_to_eight', False):
    self.src_seq2idx = pad_vocab_to_eight(self.src_seq2idx)
    self.tgt_seq2idx = pad_vocab_to_eight(self.tgt_seq2idx)

  self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
  self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

  self.params['src_vocab_size'] = len(self.src_seq2idx)
  self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
  self.params['target_seq2idx'] = self.tgt_seq2idx
  self.params['source_seq2idx'] = self.src_seq2idx
  self.params['target_idx2seq'] = self.tgt_idx2seq
  self.params['source_idx2seq'] = self.src_idx2seq

  self._input_tensors = {}
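# Illustrative only: a partial params sketch for the layer above. Paths are
# assumed placeholders. Note the constraint enforced by the constructor:
# with 'pad_lengths_to_eight': True, 'max_length' must be a multiple of 8,
# otherwise a ValueError is raised. With
# 'special_tokens_already_in_vocab': False the layer adds UNK/S/EOS/PAD
# entries itself.
_example_parallel_text_params = {
    'source_file': 'train.clean.en',           # assumed path
    'target_file': 'train.clean.de',           # assumed path
    'src_vocab_file': 'vocab.bpe.32000',       # assumed path
    'tgt_vocab_file': 'vocab.bpe.32000',       # assumed path
    'batch_size': 128,
    'max_length': 56,                          # multiple of 8
    'pad_lengths_to_eight': True,
    'special_tokens_already_in_vocab': False,
}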
def __init__(self, params, model, num_workers, worker_id):
  """Speech-to-text data layer constructor.

  See parent class for arguments description.

  Config parameters:

  * **num_audio_features** (int) --- number of audio features to extract.
  * **input_type** (str) --- could be either "spectrogram" or "mfcc".
  * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
  * **dataset_files** (list) --- list with paths to all dataset .csv files.
  * **augmentation** (dict) --- optional dictionary with data augmentation
    parameters. Can contain "time_stretch_ratio", "noise_level_min" and
    "noise_level_max" parameters, e.g.::

      {
        'time_stretch_ratio': 0.05,
        'noise_level_min': -90,
        'noise_level_max': -60,
      }

    For additional details on these parameters see
    :func:`data.speech2text.speech_utils.augment_audio_signal` function.
  * **autoregressive** (bool) --- boolean indicating whether the model is
    autoregressive.
  * **syn_enable** (bool) --- boolean indicating whether the model is using
    synthetic data.
  * **syn_subdirs** (list) --- must be defined if using synthetic mode.
    Contains a list of subdirectories that hold the synthetic wav files.
  """
  super(Speech2TextDataLayer, self).__init__(params, model,
                                             num_workers, worker_id)

  # we need this until python_speech_features gets updated on pypi.org
  self.apply_window = 'winfunc' in inspect.getargspec(psf.logfbank)[0]
  if not self.apply_window and \
     (self.params['input_type'] == 'mfcc' or
      self.params['input_type'] == 'logfbank'):
    print('WARNING: using python_speech_features WITHOUT windowing function')
    print('Please install the latest python_speech_features (from GitHub)')

  self.params['autoregressive'] = self.params.get('autoregressive', False)
  self.autoregressive = self.params['autoregressive']
  self.params['bpe'] = self.params.get('bpe', False)
  # default target padding value; the autoregressive branch below
  # overrides it with the end-of-sequence index
  self.target_pad_value = 0
  if self.params['bpe']:
    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(self.params['vocab_file'])
    self.params['tgt_vocab_size'] = len(self.sp) + 1
  else:
    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'],
        read_chars=True,
    )
    if not self.autoregressive:
      # add one for implied blank token
      self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
    else:
      num_chars_orig = len(self.params['char2idx'])
      self.params['tgt_vocab_size'] = num_chars_orig + 2
      self.start_index = num_chars_orig
      self.end_index = num_chars_orig + 1
      self.params['char2idx']['<S>'] = self.start_index
      self.params['char2idx']['</S>'] = self.end_index
      self.target_pad_value = self.end_index
    self.params['idx2char'] = {i: w for w, i in
                               self.params['char2idx'].items()}

  self._files = None
  if self.params["interactive"]:
    self.params['max_duration'] = params.get('max_duration', -1.0)
    self.params['window_size'] = params.get('window_size', 20e-3)
    self.params['window_stride'] = params.get('window_stride', 10e-3)
    return
  for csv in params['dataset_files']:
    files = pd.read_csv(csv, encoding='utf-8')
    if self._files is None:
      self._files = files
    else:
      self._files = self._files.append(files)

  if self.params['mode'] != 'infer':
    cols = ['wav_filename', 'transcript']
  else:
    cols = 'wav_filename'
  self.all_files = self._files.loc[:, cols].values
  self._files = self.split_data(self.all_files)

  self._size = self.get_size_in_samples()
  self._dataset = None
  self._iterator = None
  self._input_tensors = None

  self.params['max_duration'] = params.get('max_duration', -1.0)
  self.params['window_size'] = params.get('window_size', 20e-3)
  self.params['window_stride'] = params.get('window_stride', 10e-3)
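# Illustrative only: the 'winfunc' capability probe from the constructor
# above, reproduced in isolation. Assumes python_speech_features is
# installed; older PyPI releases of psf.logfbank lack the 'winfunc'
# argument, which is what the warning is about. getfullargspec is used here
# as the non-deprecated equivalent of the getargspec call in the code.
import inspect

import python_speech_features as psf

def supports_windowing():
  # True if the installed python_speech_features accepts a window function
  return 'winfunc' in inspect.getfullargspec(psf.logfbank).args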