def __init__(self, params, model, num_workers=1, worker_id=0):
    """Build the data layer and load source/target vocabularies into memory.

    Reads ``src_vocab_file`` from the config; ``tgt_vocab_file`` is optional
    and falls back to the source vocabulary file. The token->index and
    index->token mappings, together with both vocabulary sizes, are stored on
    the instance and also written back into ``self.params``.

    See parent class for arguments description.
    """
    super(TransformerDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    source_path = self.params['src_vocab_file']
    self.src_vocab_file = source_path
    # no explicit target vocab means both sides share a common vocab file
    self.tgt_vocab_file = self.params.get('tgt_vocab_file', source_path)

    # vocabularies live in RAM; pre-processed vocab starts from PAD, EOS
    self.src_seq2idx = load_pre_existing_vocabulary(
        self.src_vocab_file, min_idx=PAD_ID
    )
    self.tgt_seq2idx = load_pre_existing_vocabulary(
        self.tgt_vocab_file, min_idx=PAD_ID
    )

    # reverse lookups: index -> token
    self.src_idx2seq = dict(
        zip(self.src_seq2idx.values(), self.src_seq2idx.keys())
    )
    self.tgt_idx2seq = dict(
        zip(self.tgt_seq2idx.values(), self.tgt_seq2idx.keys())
    )

    # publish sizes and lookup tables through the shared params
    exported = (
        ('src_vocab_size', len(self.src_seq2idx)),
        ('tgt_vocab_size', len(self.tgt_seq2idx)),
        ('target_seq2idx', self.tgt_seq2idx),
        ('source_seq2idx', self.src_seq2idx),
        ('target_idx2seq', self.tgt_idx2seq),
        ('source_idx2seq', self.src_idx2seq),
    )
    for key, value in exported:
        self.params[key] = value

    self._num_workers = num_workers
    self._worker_id = worker_id

    self._input_tensors = {}
    self._iterator = None
    self.batched_dataset = None
def __init__(self, params, model, num_workers=1, worker_id=0):
    """Construct the layer, load both vocabularies and export them via params.

    ``src_vocab_file`` must be present in the config. When ``tgt_vocab_file``
    is absent, the source vocabulary file is used for the target side too.

    See parent class for arguments description.
    """

    def invert(mapping):
        # turn a token -> index mapping into index -> token
        flipped = {}
        for token, index in mapping.items():
            flipped[index] = token
        return flipped

    super(TransformerDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    self.src_vocab_file = self.params['src_vocab_file']
    if 'tgt_vocab_file' in self.params:
        self.tgt_vocab_file = self.params.get('tgt_vocab_file')
    else:
        # target vocab not given: assume a common vocab file
        self.tgt_vocab_file = self.src_vocab_file

    # both vocabularies are loaded to RAM;
    # the pre-processed vocab starts from PAD, EOS
    source_vocab = load_pre_existing_vocabulary(
        self.src_vocab_file, min_idx=PAD_ID
    )
    self.src_seq2idx = source_vocab
    target_vocab = load_pre_existing_vocabulary(
        self.tgt_vocab_file, min_idx=PAD_ID
    )
    self.tgt_seq2idx = target_vocab

    self.src_idx2seq = invert(source_vocab)
    self.tgt_idx2seq = invert(target_vocab)

    self.params['src_vocab_size'] = len(source_vocab)
    self.params['tgt_vocab_size'] = len(target_vocab)
    self.params['target_seq2idx'] = target_vocab
    self.params['source_seq2idx'] = source_vocab
    self.params['target_idx2seq'] = self.tgt_idx2seq
    self.params['source_idx2seq'] = self.src_idx2seq

    self._num_workers, self._worker_id = num_workers, worker_id

    self._input_tensors = {}
    self._iterator = None
    self.batched_dataset = None
def __init__(self, params, model, num_workers, worker_id):
    """Speech-to-text data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::

        {
          'time_stretch_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }

      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.

    Raises:
      ValueError: if ``dataset_files`` is empty.
    """
    super(Speech2TextDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'], read_chars=True,
    )
    self.params['idx2char'] = {
        i: w for w, i in self.params['char2idx'].items()
    }
    # add one for implied blank token
    self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1

    # Read every csv first and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every iteration.
    frames = [
        pd.read_csv(csv_path, encoding='utf-8')
        for csv_path in params['dataset_files']
    ]
    if not frames:
        raise ValueError("'dataset_files' must contain at least one csv file")
    self._files = pd.concat(frames)

    # inference needs only the audio paths; other modes also need transcripts
    if self.params['mode'] != 'infer':
        cols = ['wav_filename', 'transcript']
    else:
        cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None
def __init__(self, params, model, num_workers=None, worker_id=None):
    """Speech-to-text data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::

        {
          'time_stretch_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }

      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.

    Raises:
      ValueError: if ``dataset_files`` is empty.
    """
    super(Speech2TextDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    char2idx = load_pre_existing_vocabulary(
        self.params['vocab_file'], read_chars=True,
    )
    self.params['char2idx'] = char2idx
    self.params['idx2char'] = {i: w for w, i in char2idx.items()}
    # add one for implied blank token
    self.params['tgt_vocab_size'] = len(char2idx) + 1

    # Collect all csv contents and concatenate in a single call;
    # DataFrame.append no longer exists in pandas 2.0 and was quadratic.
    frames = []
    for csv_path in params['dataset_files']:
        frames.append(pd.read_csv(csv_path, encoding='utf-8'))
    if not frames:
        raise ValueError("'dataset_files' must contain at least one csv file")
    self._files = pd.concat(frames)

    # transcripts are only selected outside of inference mode
    if self.params['mode'] != 'infer':
        cols = ['wav_filename', 'transcript']
    else:
        cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None
def __init__(self, params, model, num_workers, worker_id):
    """Speech-to-text data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **backend** (str) --- audio pre-processing backend
      ('psf' [default] or librosa [recommended]).
    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "speed_perturbation_ratio", "noise_level_min"
      and "noise_level_max" parameters, e.g.::

        {
          'speed_perturbation_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }

      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.
      A legacy "time_stretch_ratio" entry is copied to
      "speed_perturbation_ratio" with a warning. If "n_freq_mask" is present,
      "width_freq_mask" (default 10) must not exceed **num_audio_features**.
    * **pad_to** (int) --- align audio sequence length to pad_to value.
    * **max_duration** (float) --- drop all samples longer than
      **max_duration** (seconds). Defaults to -1.0.
    * **min_duration** (float) --- drop all samples shorter than
      **min_duration** (seconds). Defaults to -1.0.
    * **bpe** (bool) --- use BPE encodings. Defaults to False.
    * **autoregressive** (bool) --- boolean indicating whether the model is
      autoregressive. Defaults to False.
    * **syn_enable** (bool) --- boolean indicating whether the model is
      using synthetic data.
    * **syn_subdirs** (list) --- must be defined if using synthetic mode.
      Contains a list of subdirectories that hold the synthetic wav files.
    * **window_size** (float) --- window's duration (in seconds).
      Defaults to 20e-3.
    * **window_stride** (float) --- window's stride (in seconds).
      Defaults to 10e-3.
    * **dither** (float) --- weight of Gaussian noise to apply to input
      signal for dithering/preventing quantization noise.
    * **num_fft** (int) --- optional size of fft window to use if features
      require fft, defaults to smallest power of 2 larger than window size.
    * **norm_per_feature** (bool) --- if True, the output features will be
      normalized (whitened) individually. if False, a global mean/std over
      all features will be used for normalization.
    * **window** (str) --- window function to apply before FFT
      ('hanning', 'hamming', 'none').
    * **precompute_mel_basis** (bool) --- compute and store mel basis. If
      False, it will compute it for every get_speech_features call.
      Default: False. Only takes effect here when **input_type** is
      "logfbank".
    * **sample_freq** (int) --- required for precompute_mel_basis.
      Defaults to 16000.

    Raises:
      ValueError: if ``dataset_files`` is empty, or if 'width_freq_mask' is
        larger than 'num_audio_features'.
    """
    super(Speech2TextDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    self.params['autoregressive'] = self.params.get('autoregressive', False)
    self.autoregressive = self.params['autoregressive']
    self.params['bpe'] = self.params.get('bpe', False)
    if self.params['bpe']:
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(self.params['vocab_file'])
        self.params['tgt_vocab_size'] = len(self.sp) + 1
    else:
        self.params['char2idx'] = load_pre_existing_vocabulary(
            self.params['vocab_file'], read_chars=True,
        )
        if not self.autoregressive:
            # add one for implied blank token
            self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
        else:
            # autoregressive models get explicit start/end symbols appended
            # after the characters read from the vocabulary file
            num_chars_orig = len(self.params['char2idx'])
            self.params['tgt_vocab_size'] = num_chars_orig + 2
            self.start_index = num_chars_orig
            self.end_index = num_chars_orig + 1
            self.params['char2idx']['<S>'] = self.start_index
            self.params['char2idx']['</S>'] = self.end_index
            self.target_pad_value = self.end_index
        self.params['idx2char'] = {
            i: w for w, i in self.params['char2idx'].items()
        }
    # NOTE(review): this overwrites the value assigned in the autoregressive
    # branch above; kept as in the original -- confirm that 0 is intended.
    self.target_pad_value = 0

    self._files = None
    # interactive mode does not read any dataset files
    if self.params["interactive"]:
        return

    # Read each csv, drop rows without a transcript, then concatenate once.
    # DataFrame.append was removed in pandas 2.0 and copied on every call.
    frames = []
    for csv_path in params['dataset_files']:
        frame = pd.read_csv(csv_path, encoding='utf-8')
        frame.dropna(subset=["transcript"], inplace=True)
        frames.append(frame)
    if not frames:
        raise ValueError("'dataset_files' must contain at least one csv file")
    self._files = pd.concat(frames)

    if self.params['mode'] != 'infer':
        cols = ['wav_filename', 'transcript']
    else:
        cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None

    # fill in defaults so later code can read these keys unconditionally
    self.params['min_duration'] = self.params.get('min_duration', -1.0)
    self.params['max_duration'] = self.params.get('max_duration', -1.0)
    self.params['window_size'] = self.params.get('window_size', 20e-3)
    self.params['window_stride'] = self.params.get('window_stride', 10e-3)
    self.params['sample_freq'] = self.params.get('sample_freq', 16000)

    mel_basis = None
    if (self.params.get("precompute_mel_basis", False) and
            self.params["input_type"] == "logfbank"):
        # default FFT size: smallest power of two covering one window
        num_fft = (
            self.params.get("num_fft", None) or
            2**math.ceil(math.log2(
                self.params['window_size'] * self.params['sample_freq']
            ))
        )
        # sr / n_fft are passed by keyword: newer librosa releases no longer
        # accept them positionally, and older ones accept both forms.
        mel_basis = librosa.filters.mel(
            sr=self.params['sample_freq'],
            n_fft=num_fft,
            n_mels=self.params['num_audio_features'],
            fmin=0,
            fmax=int(self.params['sample_freq'] / 2)
        )
    self.params['mel_basis'] = mel_basis

    if 'n_freq_mask' in self.params.get('augmentation', {}):
        width_freq_mask = self.params['augmentation'].get('width_freq_mask', 10)
        if width_freq_mask > self.params['num_audio_features']:
            raise ValueError(
                "'width_freq_mask'={} should be smaller ".format(width_freq_mask) +
                "than 'num_audio_features'={}".format(
                    self.params['num_audio_features']
                )
            )

    # backward compatibility with the old augmentation parameter name
    if 'time_stretch_ratio' in self.params.get('augmentation', {}):
        print("WARNING: Please update time_stretch_ratio to speed_perturbation_ratio")
        self.params['augmentation']['speed_perturbation_ratio'] = \
            self.params['augmentation']['time_stretch_ratio']
def __init__(self, params, model, num_workers, worker_id):
    """Speech-to-text data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::

        {
          'time_stretch_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }

      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.
    * **autoregressive** (bool) --- boolean indicating whether the model is
      autoregressive. Defaults to False.
    * **bpe** (bool) --- when True, **vocab_file** is loaded as a
      sentencepiece model. Defaults to False.
    * **syn_enable** (bool) --- boolean indicating whether the model is
      using synthetic data.
    * **syn_subdirs** (list) --- must be defined if using synthetic mode.
      Contains a list of subdirectories that hold the synthetic wav files.
    * **max_duration** (float) --- defaults to -1.0.
    * **window_size** (float) --- defaults to 20e-3.
    * **window_stride** (float) --- defaults to 10e-3.

    Raises:
      ValueError: if ``dataset_files`` is empty.
    """
    super(Speech2TextDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    self.params['autoregressive'] = self.params.get('autoregressive', False)
    self.autoregressive = self.params['autoregressive']
    self.params['bpe'] = self.params.get('bpe', False)
    if self.params['bpe']:
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(self.params['vocab_file'])
        self.params['tgt_vocab_size'] = len(self.sp) + 1
    else:
        self.params['char2idx'] = load_pre_existing_vocabulary(
            self.params['vocab_file'], read_chars=True,
        )
        if not self.autoregressive:
            # add one for implied blank token
            self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
        else:
            # start/end symbols take the two indices after the loaded chars
            num_chars_orig = len(self.params['char2idx'])
            self.params['tgt_vocab_size'] = num_chars_orig + 2
            self.start_index = num_chars_orig
            self.end_index = num_chars_orig + 1
            self.params['char2idx']['<S>'] = self.start_index
            self.params['char2idx']['</S>'] = self.end_index
            self.target_pad_value = self.end_index
        self.params['idx2char'] = {
            i: w for w, i in self.params['char2idx'].items()
        }
    # NOTE(review): this overwrites the value assigned in the autoregressive
    # branch above; kept as in the original -- confirm that 0 is intended.
    self.target_pad_value = 0

    self._files = None
    # interactive mode does not read any dataset files
    if self.params["interactive"]:
        return

    # Single concat instead of repeated DataFrame.append, which was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    frames = [
        pd.read_csv(csv_path, encoding='utf-8')
        for csv_path in params['dataset_files']
    ]
    if not frames:
        raise ValueError("'dataset_files' must contain at least one csv file")
    self._files = pd.concat(frames)

    if self.params['mode'] != 'infer':
        cols = ['wav_filename', 'transcript']
    else:
        cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None

    # defaults are read from the constructor argument, as in the original
    self.params['max_duration'] = params.get('max_duration', -1.0)
    self.params['window_size'] = params.get('window_size', 20e-3)
    self.params['window_stride'] = params.get('window_stride', 10e-3)
def __init__(self, params, model, num_workers=None, worker_id=None):
    """Text-to-speech data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **dataset** (str) --- The dataset to use. 'LJ' (LJSpeech 1.1) and
      'MAILABS' select the sampling rate and FFT size used here.
    * **num_audio_features** (int) --- number of audio features to extract.
      Must be a dict with int "mel" and "magnitude" entries when
      **output_type** contains "both".
    * **output_type** (str) --- could be either "magnitude", or "mel".
    * **vocab_file** (str) --- path to vocabulary file.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
      File is assumed to be separated by "|".
    * **dataset_location** (string) --- string with path to directory where
      wavs are stored.
    * **feature_normalize** (bool) --- whether to normalize the data with a
      preset mean and std. Not supported in "both" mode.
    * **feature_normalize_mean** (bool) --- used for feature normalize.
      Defaults to 0.
    * **feature_normalize_std** (bool) --- used for feature normalize.
      Defaults to 1.
    * **mag_power** (int) --- the power to which the magnitude spectrogram
      is scaled to. 1 for energy spectrogram, 2 for power spectrogram.
      (The earlier documentation listed both 1 and 2 as the default; the
      default is not set in this constructor -- TODO confirm.)
    * **pad_EOS** (bool) --- whether to apply EOS tokens to both the text
      and the speech signal. Will pad at least 1 token regardless of pad_to
      value. Defaults to True.
    * **pad_value** (float) --- The value we pad the spectrogram with.
      Defaults to np.log(data_min).
    * **pad_to** (int) --- we pad such that the resulting datapoint is a
      multiple of pad_to. Defaults to 8.
    * **trim** (bool) --- Whether to trim silence via librosa or not.
      Defaults to False.
    * **data_min** (float) --- min clip value prior to taking the log.
      Defaults to 1e-5. Please change to 1e-2 if using htk mels. Must be a
      dict with float "mel" and "magnitude" entries in "both" mode.
    * **duration_min** (int) --- Minimum duration in steps for speech
      signal. All signals less than this will be cut from the training set.
      Defaults to 0.
    * **duration_max** (int) --- Maximum duration in steps for speech
      signal. All signals greater than this will be cut from the training
      set. Defaults to 4000.
    * **mel_type** (str) --- One of ['slaney', 'htk']. Decides which
      algorithm to use to compute mel specs. Defaults to htk.
    * **style_input** (str) --- Can be either None or "wav". Must be set to
      "wav" for GST. Defaults to None.

    Raises:
      ValueError: if ``num_audio_features`` / ``data_min`` do not have the
        type required by the selected output mode, if feature normalization
        is requested in "both" mode, or if ``dataset_files`` is empty.
    """
    super(Text2SpeechDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    names = ['wav_filename', 'raw_transcript', 'transcript']
    sep = '\x7c'
    header = None

    if self.params["dataset"] == "LJ":
        self._sampling_rate = 22050
        self._n_fft = 1024
    elif self.params["dataset"] == "MAILABS":
        self._sampling_rate = 16000
        self._n_fft = 800

    # Character level vocab
    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'],
        min_idx=3,
        read_chars=True,
    )
    # Add the pad, start, and end chars
    self.params['char2idx']['<p>'] = 0
    self.params['char2idx']['<s>'] = 1
    self.params['char2idx']['</s>'] = 2
    self.params['idx2char'] = {
        i: w for w, i in self.params['char2idx'].items()
    }
    self.params['src_vocab_size'] = len(self.params['char2idx'])

    n_feats = self.params['num_audio_features']
    if "both" in self.params["output_type"]:
        self._both = True
        if self.params["feature_normalize"]:
            raise ValueError(
                "feature normalize is not currently enabled for both mode")
        if not isinstance(n_feats, dict):
            raise ValueError(
                "num_audio_features must be a dictionary for both mode")
        # Both keys are required. The original used `and`, which let a dict
        # with a single key through and then failed with a KeyError below.
        if "mel" not in n_feats or "magnitude" not in n_feats:
            raise ValueError(
                "num_audio_features must contain mel and magnitude keys")
        if (not isinstance(n_feats["mel"], int) or
                not isinstance(n_feats["magnitude"], int)):
            raise ValueError("num_audio_features must be a int")
        n_mels = n_feats['mel']

        data_min = self.params.get("data_min", None)
        if data_min is not None:
            if not isinstance(data_min, dict):
                raise ValueError(
                    "data_min must be a dictionary for both mode")
            # same both-keys requirement as for num_audio_features
            if "mel" not in data_min or "magnitude" not in data_min:
                raise ValueError(
                    "data_min must contain mel and magnitude keys")
            if (not isinstance(data_min["mel"], float) or
                    not isinstance(data_min["magnitude"], float)):
                raise ValueError("data_min must be a float")
        self._exp_mag = self.params.get("exp_mag", True)
    else:
        if not isinstance(n_feats, int):
            # message corrected: the check is for int, not float
            raise ValueError(
                "num_audio_features must be an int for mel or magnitude mode"
            )
        if not isinstance(self.params.get("data_min", 1.0), float):
            raise ValueError(
                "data_min must be a float for mel or magnitude mode")
        self._both = False
        self._exp_mag = False
        n_mels = n_feats

    self._mel = "mel" in self.params["output_type"]

    if self._mel or self._both:
        # htk mels unless 'slaney' is explicitly requested
        htk = True
        norm = None
        if self.params.get('mel_type', 'htk') == 'slaney':
            htk = False
            norm = 1
        self._mel_basis = librosa.filters.mel(
            sr=self._sampling_rate,
            n_fft=self._n_fft,
            n_mels=n_mels,
            htk=htk,
            norm=norm
        )
    else:
        self._mel_basis = None

    # interactive mode does not read any dataset files
    if self.params["interactive"]:
        return

    # Load csv files with one concat; DataFrame.append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    frames = [
        pd.read_csv(
            csv_path,
            encoding='utf-8',
            sep=sep,
            header=header,
            names=names,
            quoting=3
        )
        for csv_path in params['dataset_files']
    ]
    if not frames:
        raise ValueError("'dataset_files' must contain at least one csv file")
    self._files = pd.concat(frames)

    # wav paths are needed for training/eval and for wav style input
    if (self.params['mode'] != 'infer' or
            self.params.get("style_input", None) == "wav"):
        cols = ['wav_filename', 'transcript']
    else:
        cols = 'transcript'

    all_files = self._files.loc[:, cols].values
    self._files = self.split_data(all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None
def __init__(self, params, model, num_workers=1, worker_id=0):
    """Parallel text data layer constructor.

    Reads the configuration, counts the lines of the source file to get the
    dataset size, loads the source and target vocabularies and writes the
    vocabulary sizes and lookup tables back into ``self.params``.

    See parent class for arguments description.

    Config parameters read here:

    * **batch_size**, **source_file**, **src_vocab_file**,
      **tgt_vocab_file**, **max_length** --- required.
    * **target_file** --- required unless **use_targets** is False, in which
      case the source file is used as the target file.
    * **use_targets** (bool) --- defaults to True.
    * **delimiter** (str) --- defaults to ' '.
    * **map_parallel_calls** (int) --- defaults to 8.
    * **pad_lengths_to_eight** (bool) --- defaults to False; requires
      **max_length** to be a multiple of 8.
    * **prefetch_buffer_size** --- defaults to tf.contrib.data.AUTOTUNE.
    * **shuffle_buffer_size** (int) --- defaults to -1.
    * **use_start_token** (bool) --- defaults to True.
    * **special_tokens_already_in_vocab** (bool) --- defaults to True. When
      False, the UNK, S, EOS and PAD tokens are added to both vocabularies
      and vocabulary indices start after ``UNK_ID``.
    * **pad_vocab_to_eight** (bool) --- defaults to False.

    Raises:
      ValueError: if padding lengths to eight is requested but
        ``max_length`` is not a multiple of 8.
    """
    super(ParallelTextDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )
    self._batch_size = self.params['batch_size']
    self.source_file = self.params['source_file']
    self._use_targets = self.params.get('use_targets', True)
    if not self._use_targets:
        self.target_file = self.source_file
        if 'target_file' in self.params:
            print("WARNING: target file was specified but was "
                  "ignored by data layer because 'use_targets'=False")
    else:
        self.target_file = self.params['target_file']
    self.src_vocab_file = self.params['src_vocab_file']
    self.tgt_vocab_file = self.params['tgt_vocab_file']
    self.max_len = self.params['max_length']
    self._delimiter = self.params.get('delimiter', ' ')
    self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
    self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False)
    self._prefetch_buffer_size = self.params.get(
        'prefetch_buffer_size', tf.contrib.data.AUTOTUNE
    )
    self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1)
    self._num_workers = num_workers
    self._worker_id = worker_id
    self._use_start_token = self.params.get('use_start_token', True)
    if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0):
        raise ValueError("If padding to 8 in data layer, then "
                         "max_length should be multiple of 8")

    def file_len(fname):
        # Count lines. Summing avoids the UnboundLocalError the enumerate
        # based version raised for an empty file (which now yields 0).
        with open(fname, encoding="utf-8") as f:
            return sum(1 for _ in f)

    self.dataset_size = file_len(self.source_file)
    special_tokens_already_in_vocab = self.params.get(
        'special_tokens_already_in_vocab', True
    )

    # load source and target vocabularies to RAM;
    # leave room for the special tokens when the files do not contain them
    if special_tokens_already_in_vocab:
        min_idx = 0
    else:
        min_idx = SpecialTextTokens.UNK_ID.value + 1
    self.src_seq2idx = load_pre_existing_vocabulary(
        self.src_vocab_file, min_idx=min_idx
    )
    self.tgt_seq2idx = load_pre_existing_vocabulary(
        self.tgt_vocab_file, min_idx=min_idx
    )

    if not special_tokens_already_in_vocab:
        # manually add special tokens, in the original insertion order:
        # unknown symbol, sentence start, sentence end, padding
        for token in (SpecialTextTokens.UNK_ID,
                      SpecialTextTokens.S_ID,
                      SpecialTextTokens.EOS_ID,
                      SpecialTextTokens.PAD_ID):
            token_str = SpecialTextTokens.to_string(token.value)
            self.src_seq2idx[token_str] = token.value
            self.tgt_seq2idx[token_str] = token.value

    if self.params.get('pad_vocab_to_eight', False):
        self.src_seq2idx = pad_vocab_to_eight(self.src_seq2idx)
        self.tgt_seq2idx = pad_vocab_to_eight(self.tgt_seq2idx)

    self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
    self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

    self.params['src_vocab_size'] = len(self.src_seq2idx)
    self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
    self.params['target_seq2idx'] = self.tgt_seq2idx
    self.params['source_seq2idx'] = self.src_seq2idx
    self.params['target_idx2seq'] = self.tgt_idx2seq
    self.params['source_idx2seq'] = self.src_idx2seq

    self._input_tensors = {}
def __init__(self, params, model, num_workers=None, worker_id=None):
    """Text-to-speech data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **num_audio_features** (int) --- number of audio features to extract.
    * **output_type** (str) --- could be either "magnitude", or "mel".
    * **vocab_file** (str) --- path to vocabulary file.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
      File is assumed to be separated by "|".
    * **dataset_location** (string) --- string with path to directory where
      wavs are stored.
    * **feature_normalize** (bool) --- whether to normalize the data with a
      preset mean and std.
    * **feature_normalize_mean** (bool) --- used for feature normalize.
      Defaults to 0.
    * **feature_normalize_std** (bool) --- used for feature normalize.
      Defaults to 1.
    * **mag_power** (int) --- the power to which the magnitude spectrogram
      is scaled to: 1 for energy spectrogram, 2 for power spectrogram.
      Defaults to 2.
    * **pad_EOS** (bool) --- whether to apply EOS tokens to both the text
      and the speech signal. Will pad at least 1 token regardless of pad_to
      value. Defaults to True.
    * **pad_to** (int) --- we pad such that the resulting datapoint is a
      multiple of pad_to. Defaults to 8.

    Raises:
      ValueError: if ``dataset_files`` is missing or empty (outside of
        interactive mode).
    """
    super(Text2SpeechDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    # Character level vocab
    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'],
        read_chars=True,
    )
    self.params['idx2char'] = {
        i: w for w, i in self.params['char2idx'].items()
    }
    # add one for implied blank token
    self.params['src_vocab_size'] = len(self.params['char2idx']) + 1

    # This assumes that the LJSpeech dataset is used
    if "mel" in self.params["output_type"]:
        self._mel = True
        self._mel_basis = librosa.filters.mel(
            sr=22050,
            n_fft=1024,
            n_mels=self.params['num_audio_features']
        )
    else:
        self._mel = False

    # The rest of the code is not needed for interactive infer
    if self.params["interactive"]:
        return

    if "dataset_files" not in self.params:
        raise ValueError("dataset_files parameter has to be specified")

    names = ['wav_filename', 'transcript', 'transcript_normalized']

    self._load_from_disk = "disk" in self.params["output_type"]

    # Load csv files with one concat; DataFrame.append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    frames = [
        pd.read_csv(
            csv_path,
            encoding='utf-8',
            sep='\x7c',
            header=None,
            names=names,
            quoting=3
        )
        for csv_path in params['dataset_files']
    ]
    if not frames:
        raise ValueError("'dataset_files' must contain at least one csv file")
    self._files = pd.concat(frames)

    # inference only needs the normalized text
    if self.params['mode'] != 'infer':
        cols = ['wav_filename', 'transcript_normalized']
    else:
        cols = 'transcript_normalized'

    all_files = self._files.loc[:, cols].values
    self._files = self.split_data(all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None
def __init__(self, params, model, num_workers=1, worker_id=0):
    """Parallel text data layer constructor.

    Reads the configuration, counts the lines of the source file to get the
    dataset size, loads the source and target vocabularies, adds the special
    tokens (UNK, S, EOS, PAD) to both and writes the vocabulary sizes and
    lookup tables back into ``self.params``.

    See parent class for arguments description.

    Config parameters read here:

    * **batch_size**, **source_file**, **src_vocab_file**,
      **tgt_vocab_file**, **max_length** --- required.
    * **target_file** --- required unless **use_targets** is False, in which
      case the source file is used as the target file.
    * **use_targets** (bool) --- defaults to True.
    * **delimiter** (str) --- defaults to ' '.
    * **map_parallel_calls** (int) --- defaults to 8.
    * **pad_lengths_to_eight** (bool) --- defaults to False; requires
      **max_length** to be a multiple of 8.
    * **prefetch_buffer_size** (int) --- defaults to 4.
    * **pad_vocab_to_eight** (bool) --- defaults to False.

    Raises:
      ValueError: if padding lengths to eight is requested but
        ``max_length`` is not a multiple of 8.
    """
    super(ParallelTextDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )
    self._batch_size = self.params['batch_size']
    self.source_file = self.params['source_file']
    self._use_targets = self.params.get('use_targets', True)
    if not self._use_targets:
        self.target_file = self.source_file
        if 'target_file' in self.params:
            print("WARNING: target file was specified but was "
                  "ignored by data layer because 'use_targets'=False")
    else:
        self.target_file = self.params['target_file']
    self.src_vocab_file = self.params['src_vocab_file']
    self.tgt_vocab_file = self.params['tgt_vocab_file']
    self.max_len = self.params['max_length']
    self._delimiter = self.params.get('delimiter', ' ')
    self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
    self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False)
    self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', 4)
    self._num_workers = num_workers
    self._worker_id = worker_id
    if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0):
        raise ValueError("If padding to 8 in data layer, then "
                         "max_length should be multiple of 8")

    def file_len(fname):
        # Count lines. The explicit encoding keeps the count independent of
        # the platform's locale; summing returns 0 for an empty file, where
        # the enumerate-based version raised UnboundLocalError.
        with open(fname, encoding="utf-8") as f:
            return sum(1 for _ in f)

    self.dataset_size = file_len(self.source_file)

    # load source and target vocabularies to RAM;
    # indices up to UNK_ID are reserved for the special tokens added below
    first_regular_idx = SpecialTextTokens.UNK_ID.value + 1
    self.src_seq2idx = load_pre_existing_vocabulary(
        self.src_vocab_file, min_idx=first_regular_idx
    )
    self.tgt_seq2idx = load_pre_existing_vocabulary(
        self.tgt_vocab_file, min_idx=first_regular_idx
    )

    # add special tokens in the original insertion order:
    # unknown symbol, sentence start, sentence end, padding
    for token in (SpecialTextTokens.UNK_ID,
                  SpecialTextTokens.S_ID,
                  SpecialTextTokens.EOS_ID,
                  SpecialTextTokens.PAD_ID):
        token_str = SpecialTextTokens.to_string(token.value)
        self.src_seq2idx[token_str] = token.value
        self.tgt_seq2idx[token_str] = token.value

    if self.params.get('pad_vocab_to_eight', False):
        self.src_seq2idx = pad_vocab_to_eight(self.src_seq2idx)
        self.tgt_seq2idx = pad_vocab_to_eight(self.tgt_seq2idx)

    self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
    self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

    self.params['src_vocab_size'] = len(self.src_seq2idx)
    self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
    self.params['target_seq2idx'] = self.tgt_seq2idx
    self.params['source_seq2idx'] = self.src_seq2idx
    self.params['target_idx2seq'] = self.tgt_idx2seq
    self.params['source_idx2seq'] = self.src_idx2seq

    self._input_tensors = {}
def __init__(self, params, model, num_workers=None, worker_id=None):
    """Text-to-speech data layer constructor.

    See parent class for arguments description.

    Config parameters read here:

    * **vocab_file** (str) --- path to the character vocabulary file. The
      pad, start and end symbols are added at indices 0, 1 and 2.
    * **dataset_files** (list) --- list with paths to all dataset .csv
      files, read with "|" as the separator and the columns
      ``mel_file``, ``transcript``, ``fileid``.
    * **use_cache** (bool) --- stored as ``self.use_cache``. Defaults to
      False.
    * **n_samples_train** (int) --- optional; in 'train' mode keep only this
      many samples with the shortest transcripts.
    * **n_samples_eval** (int) --- optional; in 'eval' mode samples are
      sorted by transcript length and, if given, only this many are kept.

    Raises:
      ValueError: if ``dataset_files`` is empty.
    """
    super(Text2SpeechDataLayer, self).__init__(
        params, model, num_workers, worker_id
    )

    self.use_cache = self.params.get('use_cache', False)
    self._cache = {}

    names = ['mel_file', 'transcript', "fileid"]
    sep = '\x7c'
    header = None

    # Character level vocab
    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'],
        min_idx=3,
        read_chars=True,
    )
    # Add the pad, start, and end chars
    self.params['char2idx']['<p>'] = 0
    self.params['char2idx']['<s>'] = 1
    self.params['char2idx']['</s>'] = 2
    self.params['idx2char'] = {
        i: w for w, i in self.params['char2idx'].items()
    }
    self.params['src_vocab_size'] = len(self.params['char2idx'])

    # Load csv files. ignore_index gives the combined frame unique row
    # labels: with the per-file labels kept by DataFrame.append (removed in
    # pandas 2.0), the reindex calls below fail on duplicate labels as soon
    # as more than one dataset file is configured.
    frames = [
        pd.read_csv(
            csv_path,
            encoding='utf-8',
            sep=sep,
            header=header,
            names=names,
            quoting=3
        )
        for csv_path in params['dataset_files']
    ]
    if not frames:
        raise ValueError("'dataset_files' must contain at least one csv file")
    self._files = pd.concat(frames, ignore_index=True)

    if self.params['mode'] == 'train' and 'n_samples_train' in self.params:
        # order by transcript length and keep the shortest samples
        indices = self._files['transcript'].str.len().sort_values().index
        self._files = self._files.reindex(indices)

        n_samples = self.params.get('n_samples_train')
        print('Using just the {} shortest samples'.format(n_samples))
        self._files = self._files.iloc[:n_samples]

    if self.params['mode'] == 'eval':
        # evaluation always runs over samples sorted by transcript length
        indices = self._files['transcript'].str.len().sort_values().index
        self._files = self._files.reindex(indices)

        if 'n_samples_eval' in self.params:
            n_samples = self.params['n_samples_eval']
            self._files = self._files.iloc[:n_samples]

    cols = ['mel_file', 'transcript', "fileid"]

    all_files = self._files.loc[:, cols].values
    # every field is converted to str before the data is split
    all_files = [list(map(str, i)) for i in all_files]
    self._files = self.split_data(all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None