Example #1
  def __init__(self, params, model, num_workers=1, worker_id=0):
    super(TransformerDataLayer, self).__init__(params, model,
                                               num_workers, worker_id)
    self.src_vocab_file = self.params['src_vocab_file']
    # if tgt vocab isn't specified, assume a vocab file shared with the source
    self.tgt_vocab_file = self.params.get('tgt_vocab_file', self.src_vocab_file)

    # load source and target vocabularies to RAM;
    # the pre-processed vocab file is assumed to already begin with the
    # special PAD and EOS tokens, so indexing starts at PAD_ID
    self.src_seq2idx = load_pre_existing_vocabulary(
      self.src_vocab_file,
      min_idx=PAD_ID)
    self.tgt_seq2idx = load_pre_existing_vocabulary(
      self.tgt_vocab_file,
      min_idx=PAD_ID)

    self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
    self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

    self.params['src_vocab_size'] = len(self.src_seq2idx)
    self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
    self.params['target_seq2idx'] = self.tgt_seq2idx
    self.params['source_seq2idx'] = self.src_seq2idx
    self.params['target_idx2seq'] = self.tgt_idx2seq
    self.params['source_idx2seq'] = self.src_idx2seq

    self._num_workers = num_workers
    self._worker_id = worker_id

    self._input_tensors = {}
    self._iterator = None
    self.batched_dataset = None
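
The constructor above relies on a load_pre_existing_vocabulary helper that is not shown in this section. Below is a minimal sketch of what such a helper might look like, assuming a plain-text vocabulary file with one entry per line, optionally followed by a tab-separated count; the exact file format is an assumption, not taken from the original source:

def load_pre_existing_vocabulary(path, min_idx=0, read_chars=False):
  """Minimal sketch: map vocabulary entries to contiguous ids from min_idx.

  NOTE: an assumed implementation for illustration only; the real helper's
  file format may differ.
  """
  vocab = {}
  idx = min_idx
  with open(path, encoding='utf-8') as f:
    for line in f:
      line = line.rstrip('\n')
      if not line:
        continue
      # char vocabularies store one character per line; token vocabularies
      # store the token first, optionally followed by a tab and a count
      token = line[0] if read_chars else line.split('\t')[0]
      if token not in vocab:
        vocab[token] = idx
        idx += 1
  return vocab

Passing a non-zero min_idx leaves the ids below it free for special tokens, which is how the ParallelTextDataLayer examples further below reserve ids 0-3 for PAD, EOS, <S> and <UNK>.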
Example #2
    def __init__(self, params, model, num_workers, worker_id):
        """Speech-to-text data layer constructor.

        See parent class for arguments description.

        Config parameters:

        * **num_audio_features** (int) --- number of audio features to extract.
        * **input_type** (str) --- could be either "spectrogram" or "mfcc".
        * **vocab_file** (str) --- path to vocabulary file.
        * **dataset_files** (list) --- list with paths to all dataset .csv files.
        * **augmentation** (dict) --- optional dictionary with data augmentation
          parameters. Can contain "time_stretch_ratio", "noise_level_min" and
          "noise_level_max" parameters, e.g.::
            {
              'time_stretch_ratio': 0.05,
              'noise_level_min': -90,
              'noise_level_max': -60,
            }
          For additional details on these parameters see
          :func:`data.speech2text.speech_utils.augment_audio_signal` function.
        """
        super(Speech2TextDataLayer, self).__init__(params, model, num_workers,
                                                   worker_id)

        self.params['char2idx'] = load_pre_existing_vocabulary(
            self.params['vocab_file'],
            read_chars=True,
        )
        self.params['idx2char'] = {
            i: w
            for w, i in self.params['char2idx'].items()
        }
        # add one for implied blank token
        self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1

        self._files = None
        for csv in params['dataset_files']:
            files = pd.read_csv(csv, encoding='utf-8')
            if self._files is None:
                self._files = files
            else:
                # DataFrame.append was removed in pandas 2.0; use concat
                self._files = pd.concat([self._files, files])

        if self.params['mode'] != 'infer':
            cols = ['wav_filename', 'transcript']
        else:
            cols = 'wav_filename'

        self.all_files = self._files.loc[:, cols].values
        self._files = self.split_data(self.all_files)

        self._size = self.get_size_in_samples()
        self._dataset = None
        self._iterator = None
        self._input_tensors = None
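
The constructor above shards its file list with self.split_data, which is defined outside the constructor. A minimal sketch of the assumed sharding logic, presuming the parent class stores _num_workers and _worker_id and that each training worker should see a disjoint contiguous slice:

def split_data(self, data):
  """Sketch of per-worker sharding: disjoint contiguous slices."""
  if self.params['mode'] != 'train' or self._num_workers is None:
    return data
  shard = len(data) // self._num_workers
  start = shard * self._worker_id
  # the last worker absorbs the remainder so no sample is dropped
  if self._worker_id == self._num_workers - 1:
    return data[start:]
  return data[start:start + shard]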
Example #3
  def __init__(self, params, model, num_workers=None, worker_id=None):
    """Speech-to-text data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::
        {
          'time_stretch_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }
      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.
    """
    super(Speech2TextDataLayer, self).__init__(params, model,
                                               num_workers, worker_id)

    self.params['char2idx'] = load_pre_existing_vocabulary(
      self.params['vocab_file'], read_chars=True,
    )
    self.params['idx2char'] = {i: w for w, i in self.params['char2idx'].items()}
    # add one for implied blank token
    self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1

    self._files = None
    for csv in params['dataset_files']:
      files = pd.read_csv(csv, encoding='utf-8')
      if self._files is None:
        self._files = files
      else:
        # DataFrame.append was removed in pandas 2.0; use concat
        self._files = pd.concat([self._files, files])

    if self.params['mode'] != 'infer':
      cols = ['wav_filename', 'transcript']
    else:
      cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None
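
For reference, here is a hypothetical configuration for this data layer, assembled only from the parameters documented in the docstring above; all paths and values are placeholders, and the mode/batch_size keys are assumed to be required by the parent class:

params = {
    'num_audio_features': 64,
    'input_type': 'mfcc',
    'vocab_file': 'data/vocab.txt',       # placeholder path
    'dataset_files': ['data/train.csv'],  # placeholder path
    'mode': 'train',                      # assumed parent-class parameter
    'batch_size': 32,                     # assumed parent-class parameter
    'augmentation': {
        'time_stretch_ratio': 0.05,
        'noise_level_min': -90,
        'noise_level_max': -60,
    },
}
# data_layer = Speech2TextDataLayer(params, model, num_workers=1, worker_id=0)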
Example #4
    def __init__(self, params, model, num_workers, worker_id):
        """Speech-to-text data layer constructor.
        See parent class for arguments description.
        Config parameters:
        * **backend** (str) --- audio pre-processing backend
          ('psf' [default] or 'librosa' [recommended]).
        * **num_audio_features** (int) --- number of audio features to extract.
        * **input_type** (str) --- could be either "spectrogram" or "mfcc".
        * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
        * **dataset_files** (list) --- list with paths to all dataset .csv files.
        * **augmentation** (dict) --- optional dictionary with data augmentation
          parameters. Can contain "speed_perturbation_ratio", "noise_level_min" and
          "noise_level_max" parameters, e.g.::
            {
              'speed_perturbation_ratio': 0.05,
              'noise_level_min': -90,
              'noise_level_max': -60,
            }
          For additional details on these parameters see
          :func:`data.speech2text.speech_utils.augment_audio_signal` function.
        * **pad_to** (int) --- align audio sequence length to pad_to value.
        * **max_duration** (float) --- drop all samples longer than
          **max_duration** (seconds)
        * **min_duration** (float) --- drop all samples shorter than
          **min_duration** (seconds)
        * **bpe** (bool) --- use BPE encodings
        * **autoregressive** (bool) --- boolean indicating whether the model is
          autoregressive.
        * **syn_enable** (bool) --- boolean indicating whether the model is
          using synthetic data.
        * **syn_subdirs** (list) --- must be defined if using synthetic mode.
          Contains a list of subdirectories that hold the synthetic wav files.
        * **window_size** (float) --- window's duration (in seconds)
        * **window_stride** (float) --- window's stride (in seconds)
        * **dither** (float) --- weight of Gaussian noise to apply to input signal
          for dithering/preventing quantization noise
        * **num_fft** (int) --- size of the FFT window to use if features require
          an FFT; defaults to the smallest power of 2 larger than the window size.
        * **norm_per_feature** (bool) --- if True, the output features will be
          normalized (whitened) individually. If False, a global mean/std over all
          features will be used for normalization.
        * **window** (str) --- window function to apply before FFT
          ('hanning', 'hamming', 'none')
        * **precompute_mel_basis** (bool) --- compute and store mel basis. If False,
          it will compute it for every get_speech_features call. Default: False
        * **sample_freq** (int) --- required for precompute_mel_basis
        """
        super(Speech2TextDataLayer, self).__init__(params, model,
                                                   num_workers, worker_id)

        # targets are padded with zeros unless the autoregressive branch below
        # overrides this with the end-of-sentence index
        self.target_pad_value = 0
        self.params['autoregressive'] = self.params.get('autoregressive', False)
        self.autoregressive = self.params['autoregressive']
        self.params['bpe'] = self.params.get('bpe', False)
        if self.params['bpe']:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(self.params['vocab_file'])
            self.params['tgt_vocab_size'] = len(self.sp) + 1
        else:
            self.params['char2idx'] = load_pre_existing_vocabulary(
                self.params['vocab_file'], read_chars=True,
            )
            if not self.autoregressive:
                # add one for implied blank token
                self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
            else:
                num_chars_orig = len(self.params['char2idx'])
                self.params['tgt_vocab_size'] = num_chars_orig + 2
                self.start_index = num_chars_orig
                self.end_index = num_chars_orig + 1
                self.params['char2idx']['<S>'] = self.start_index
                self.params['char2idx']['</S>'] = self.end_index
                self.target_pad_value = self.end_index
            self.params['idx2char'] = {
                i: w for w, i in self.params['char2idx'].items()
            }

        self._files = None
        if self.params["interactive"]:
            return
        for csv in params['dataset_files']:
            files = pd.read_csv(csv, encoding='utf-8')
            files.dropna(subset=["transcript"], inplace=True)
            if self._files is None:
                self._files = files
            else:
                # DataFrame.append was removed in pandas 2.0; use concat
                self._files = pd.concat([self._files, files])

        if self.params['mode'] != 'infer':
            cols = ['wav_filename', 'transcript']
        else:
            cols = 'wav_filename'

        self.all_files = self._files.loc[:, cols].values
        self._files = self.split_data(self.all_files)

        self._size = self.get_size_in_samples()
        self._dataset = None
        self._iterator = None
        self._input_tensors = None

        self.params['min_duration'] = self.params.get('min_duration', -1.0)
        self.params['max_duration'] = self.params.get('max_duration', -1.0)
        self.params['window_size'] = self.params.get('window_size', 20e-3)
        self.params['window_stride'] = self.params.get('window_stride', 10e-3)
        self.params['sample_freq'] = self.params.get('sample_freq', 16000)

        mel_basis = None
        if (self.params.get("precompute_mel_basis", False) and
                self.params["input_type"] == "logfbank"):
            num_fft = (
                self.params.get("num_fft", None) or
                2**math.ceil(math.log2(
                    self.params['window_size']*self.params['sample_freq'])
                )
            )
            mel_basis = librosa.filters.mel(
                sr=self.params['sample_freq'],
                n_fft=num_fft,
                n_mels=self.params['num_audio_features'],
                fmin=0,
                fmax=int(self.params['sample_freq'] / 2)
            )
        self.params['mel_basis'] = mel_basis

        if 'n_freq_mask' in self.params.get('augmentation', {}):
            width_freq_mask = self.params['augmentation'].get('width_freq_mask', 10)
            if width_freq_mask > self.params['num_audio_features']:
                raise ValueError(
                    "'width_freq_mask'={} should be smaller ".format(width_freq_mask) +
                    "than 'num_audio_features'={}".format(
                        self.params['num_audio_features']
                    )
                )

        if 'time_stretch_ratio' in self.params.get('augmentation', {}):
            print("WARNING: Please update time_stretch_ratio to "
                  "speed_perturbation_ratio")
            self.params['augmentation']['speed_perturbation_ratio'] = \
                self.params['augmentation']['time_stretch_ratio']
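
The default FFT size in the mel-basis block above is the smallest power of two that covers one analysis window. A standalone check of that expression with the default 20 ms window at 16 kHz:

import math

window_size, sample_freq = 20e-3, 16000
# the window spans 320 samples, so the next power of two is 512
num_fft = 2 ** math.ceil(math.log2(window_size * sample_freq))
assert num_fft == 512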
Example #5
  def __init__(self, params, model, num_workers, worker_id):
    """Speech-to-text data layer constructor.
    See parent class for arguments description.
    Config parameters:
    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::
        {
          'time_stretch_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }
      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.
    * **autoregressive** (bool) --- boolean indicating whether the model is
      autoregressive.
    * **syn_enable** (bool) --- boolean indicating whether the model is
      using synthetic data.
    * **syn_subdirs** (list) --- must be defined if using synthetic mode.
      Contains a list of subdirectories that hold the synthetic wav files.
    """
    super(Speech2TextDataLayer, self).__init__(params, model,
                                               num_workers, worker_id)

    # targets are padded with zeros unless the autoregressive branch below
    # overrides this with the end-of-sentence index
    self.target_pad_value = 0
    self.params['autoregressive'] = self.params.get('autoregressive', False)
    self.autoregressive = self.params['autoregressive']
    self.params['bpe'] = self.params.get('bpe', False)
    if self.params['bpe']:
      self.sp = spm.SentencePieceProcessor()
      self.sp.Load(self.params['vocab_file'])
      self.params['tgt_vocab_size'] = len(self.sp) + 1
    else:
      self.params['char2idx'] = load_pre_existing_vocabulary(
          self.params['vocab_file'], read_chars=True,
      )
      if not self.autoregressive:
        # add one for implied blank token
        self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
      else:
        num_chars_orig = len(self.params['char2idx'])
        self.params['tgt_vocab_size'] = num_chars_orig + 2
        self.start_index = num_chars_orig
        self.end_index = num_chars_orig + 1
        self.params['char2idx']['<S>'] = self.start_index
        self.params['char2idx']['</S>'] = self.end_index
        self.target_pad_value = self.end_index
      self.params['idx2char'] = {
          i: w for w, i in self.params['char2idx'].items()
      }

    self._files = None
    if self.params["interactive"]:
      return
    for csv in params['dataset_files']:
      files = pd.read_csv(csv, encoding='utf-8')
      if self._files is None:
        self._files = files
      else:
        # DataFrame.append was removed in pandas 2.0; use concat
        self._files = pd.concat([self._files, files])

    if self.params['mode'] != 'infer':
      cols = ['wav_filename', 'transcript']
    else:
      cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None

    self.params['max_duration'] = params.get('max_duration', -1.0)
    self.params['window_size'] = params.get('window_size', 20e-3)
    self.params['window_stride'] = params.get('window_stride', 10e-3)
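
To make the autoregressive vocabulary extension above concrete, here is the same index arithmetic on a toy three-character vocabulary (the characters are made up):

char2idx = {'a': 0, 'b': 1, 'c': 2}
num_chars_orig = len(char2idx)         # 3
char2idx['<S>'] = num_chars_orig       # start token gets id 3
char2idx['</S>'] = num_chars_orig + 1  # end token gets id 4
tgt_vocab_size = num_chars_orig + 2    # 5 = characters + <S> + </S>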
Example #6
    def __init__(self, params, model, num_workers=None, worker_id=None):
        """Text-to-speech data layer constructor.

        See parent class for arguments description.

        Config parameters:

        * **dataset** (str) --- The dataset to use. Currently 'LJ' for the
          LJSpeech 1.1 dataset is supported.
        * **num_audio_features** (int) --- number of audio features to extract.
        * **output_type** (str) --- could be either "magnitude", or "mel".
        * **vocab_file** (str) --- path to vocabulary file.
        * **dataset_files** (list) --- list with paths to all dataset .csv files.
          Fields are assumed to be separated by "|".
        * **dataset_location** (str) --- string with path to directory where wavs
          are stored.
        * **feature_normalize** (bool) --- whether to normalize the data with a
          preset mean and std.
        * **feature_normalize_mean** (float) --- used for feature normalization.
          Defaults to 0.
        * **feature_normalize_std** (float) --- used for feature normalization.
          Defaults to 1.
        * **mag_power** (int) --- the power to which the magnitude spectrogram is
          scaled: 1 for energy spectrogram, 2 for power spectrogram. Defaults
          to 2.
        * **pad_EOS** (bool) --- whether to apply EOS tokens to both the text and
          the speech signal. Will pad at least 1 token regardless of pad_to value.
          Defaults to True.
        * **pad_value** (float) --- The value we pad the spectrogram with. Defaults
          to np.log(data_min).
        * **pad_to** (int) --- we pad such that the resulting datapoint is a
          multiple of pad_to. Defaults to 8.
        * **trim** (bool) --- Whether to trim silence via librosa or not. Defaults
          to False.
        * **data_min** (float) --- min clip value prior to taking the log. Defaults
          to 1e-5. Please change to 1e-2 if using htk mels.
        * **duration_min** (int) --- Minimum duration in steps for speech signal.
          All signals shorter than this will be cut from the training set.
          Defaults to 0.
        * **duration_max** (int) --- Maximum duration in steps for speech signal.
          All signals longer than this will be cut from the training set.
          Defaults to 4000.
        * **mel_type** (str) --- One of ['slaney', 'htk']. Decides which algorithm
          to use to compute mel specs. Defaults to 'htk'.
        * **style_input** (str) --- Can be either None or "wav". Must be set to
          "wav" for GST. Defaults to None.
        """
        super(Text2SpeechDataLayer, self).__init__(params, model, num_workers,
                                                   worker_id)

        names = ['wav_filename', 'raw_transcript', 'transcript']
        sep = '\x7c'  # the '|' character
        header = None

        if self.params["dataset"] == "LJ":
            self._sampling_rate = 22050
            self._n_fft = 1024
        elif self.params["dataset"] == "MAILABS":
            self._sampling_rate = 16000
            self._n_fft = 800

        # Character level vocab
        self.params['char2idx'] = load_pre_existing_vocabulary(
            self.params['vocab_file'],
            min_idx=3,
            read_chars=True,
        )
        # Add the pad, start, and end chars
        self.params['char2idx']['<p>'] = 0
        self.params['char2idx']['<s>'] = 1
        self.params['char2idx']['</s>'] = 2
        self.params['idx2char'] = {
            i: w
            for w, i in self.params['char2idx'].items()
        }
        self.params['src_vocab_size'] = len(self.params['char2idx'])

        n_feats = self.params['num_audio_features']
        if "both" in self.params["output_type"]:
            self._both = True
            if self.params["feature_normalize"]:
                raise ValueError(
                    "feature normalize is not currently enabled for both mode")
            if not isinstance(n_feats, dict):
                raise ValueError(
                    "num_audio_features must be a dictionary for both mode")
            else:
                if ("mel" not in n_feats and "magnitude" not in n_feats):
                    raise ValueError(
                        "num_audio_features must contain mel and magnitude keys"
                    )
                elif (not isinstance(n_feats["mel"], int)
                      or not isinstance(n_feats["magnitude"], int)):
                    raise ValueError(
                        "num_audio_features entries must be ints")
            n_mels = n_feats['mel']
            data_min = self.params.get("data_min", None)
            if data_min is not None:
                if not isinstance(data_min, dict):
                    raise ValueError(
                        "data_min must be a dictionary for both mode")
                else:
                    if "mel" not in data_min and "magnitude" not in data_min:
                        raise ValueError(
                            "data_min must contain mel and magnitude keys")
                    elif (not isinstance(data_min["mel"], float)
                          or not isinstance(data_min["magnitude"], float)):
                        raise ValueError("data_min entries must be floats")
            self._exp_mag = self.params.get("exp_mag", True)
        else:
            if not isinstance(n_feats, int):
                raise ValueError(
                    "num_audio_features must be an int for mel or magnitude mode"
                )
            if not isinstance(self.params.get("data_min", 1.0), float):
                raise ValueError(
                    "data_min must be a float for mel or magnitude mode")
            self._both = False
            self._exp_mag = False
            n_mels = n_feats

        self._mel = "mel" in self.params["output_type"]

        if self._mel or self._both:
            htk = True
            norm = None
            if self.params.get('mel_type', 'htk') == 'slaney':
                htk = False
                norm = 1
            self._mel_basis = librosa.filters.mel(sr=self._sampling_rate,
                                                  n_fft=self._n_fft,
                                                  n_mels=n_mels,
                                                  htk=htk,
                                                  norm=norm)
        else:
            self._mel_basis = None

        if self.params["interactive"]:
            return

        # Load csv files
        self._files = None
        for csvs in params['dataset_files']:
            files = pd.read_csv(csvs,
                                encoding='utf-8',
                                sep=sep,
                                header=header,
                                names=names,
                                quoting=3)
            if self._files is None:
                self._files = files
            else:
                # DataFrame.append was removed in pandas 2.0; use concat
                self._files = pd.concat([self._files, files])

        if (self.params['mode'] != 'infer'
                or self.params.get("style_input", None) == "wav"):
            cols = ['wav_filename', 'transcript']
        else:
            cols = 'transcript'

        all_files = self._files.loc[:, cols].values
        self._files = self.split_data(all_files)

        self._size = self.get_size_in_samples()
        self._dataset = None
        self._iterator = None
        self._input_tensors = None
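
The precomputed self._mel_basis above is a filterbank matrix of shape (n_mels, 1 + n_fft // 2). The projection itself happens later in the pipeline, but here is a sketch of how such a matrix is typically applied to a magnitude spectrogram:

import numpy as np
import librosa

mel_basis = librosa.filters.mel(sr=22050, n_fft=1024, n_mels=80)
# dummy magnitude spectrogram: (1 + n_fft // 2) frequency bins, 100 frames
magnitudes = np.random.rand(1 + 1024 // 2, 100)
mel_spec = np.dot(mel_basis, magnitudes)  # shape (80, 100)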
Example #7
  def __init__(self, params, model, num_workers=1, worker_id=0):
    super(ParallelTextDataLayer, self).__init__(params, model,
                                                num_workers, worker_id)
    self._batch_size = self.params['batch_size']
    self.source_file = self.params['source_file']
    self._use_targets = self.params.get('use_targets', True)
    if not self._use_targets:
      self.target_file = self.source_file
      if 'target_file' in self.params:
        print("WARNING: target file was specified but was "
              "ignored by data layer because 'use_targets'=False")
    else:
      self.target_file = self.params['target_file']
    self.src_vocab_file = self.params['src_vocab_file']
    self.tgt_vocab_file = self.params['tgt_vocab_file']
    self.max_len = self.params['max_length']
    self._delimiter = self.params.get('delimiter', ' ')
    self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
    self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False)
    self._prefetch_buffer_size = self.params.get('prefetch_buffer_size',
                                                 tf.contrib.data.AUTOTUNE)
    self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1)
    self._num_workers = num_workers
    self._worker_id = worker_id
    self._use_start_token = self.params.get('use_start_token', True)
    if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0):
      raise ValueError("If padding to 8 in data layer, then "
                       "max_length should be multiple of 8")

    def file_len(fname):
      """Count lines in a file (0 for an empty file)."""
      with open(fname, encoding="utf-8") as f:
        return sum(1 for _ in f)

    self.dataset_size = file_len(self.source_file)
    special_tokens_already_in_vocab = self.params.get(
        'special_tokens_already_in_vocab', True)

    # load source and target vocabularies to RAM
    self.src_seq2idx = load_pre_existing_vocabulary(
      self.src_vocab_file, min_idx=0 if special_tokens_already_in_vocab
      else SpecialTextTokens.UNK_ID.value + 1)
    self.tgt_seq2idx = load_pre_existing_vocabulary(
      self.tgt_vocab_file, min_idx=0 if special_tokens_already_in_vocab
      else SpecialTextTokens.UNK_ID.value + 1)

    if not special_tokens_already_in_vocab:
      # manually add special tokens
      # unknown symbol
      self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.UNK_ID.value)] = \
        SpecialTextTokens.UNK_ID.value
      self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.UNK_ID.value)] = \
        SpecialTextTokens.UNK_ID.value
      # sentence start
      self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.S_ID.value)] = \
        SpecialTextTokens.S_ID.value
      self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.S_ID.value)] = \
        SpecialTextTokens.S_ID.value
      # sentence end
      self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.EOS_ID.value)] = \
        SpecialTextTokens.EOS_ID.value
      self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.EOS_ID.value)] = \
        SpecialTextTokens.EOS_ID.value
      # padding
      self.src_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.PAD_ID.value)] = \
        SpecialTextTokens.PAD_ID.value
      self.tgt_seq2idx[
        SpecialTextTokens.to_string(SpecialTextTokens.PAD_ID.value)] = \
        SpecialTextTokens.PAD_ID.value

    if self.params.get('pad_vocab_to_eight', False):
      self.src_seq2idx = pad_vocab_to_eight(self.src_seq2idx)
      self.tgt_seq2idx = pad_vocab_to_eight(self.tgt_seq2idx)

    self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
    self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

    self.params['src_vocab_size'] = len(self.src_seq2idx)
    self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
    self.params['target_seq2idx'] = self.tgt_seq2idx
    self.params['source_seq2idx'] = self.src_seq2idx
    self.params['target_idx2seq'] = self.tgt_idx2seq
    self.params['source_idx2seq'] = self.src_idx2seq

    self._input_tensors = {}
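
pad_vocab_to_eight is another helper that is only referenced here. A minimal sketch of what it presumably does: append dummy tokens until the vocabulary size is divisible by eight, which keeps embedding and softmax matrices aligned for tensor-core kernels. The dummy-token naming scheme below is an assumption:

def pad_vocab_to_eight(vocab):
  """Sketch: pad the vocabulary dict so that len(vocab) % 8 == 0."""
  v_len = len(vocab)
  if v_len % 8 == 0:
    return vocab
  for idx in range(8 - v_len % 8):
    vocab['<pad_tok_{}>'.format(idx)] = v_len + idx  # hypothetical name
  return vocab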
Example #8
    def __init__(self, params, model, num_workers=None, worker_id=None):
        """Text-to-speech data layer constructor.

        See parent class for arguments description.

        Config parameters:

        * **num_audio_features** (int) --- number of audio features to extract.
        * **output_type** (str) --- could be either "magnitude", or "mel".
        * **vocab_file** (str) --- path to vocabulary file.
        * **dataset_files** (list) --- list with paths to all dataset .csv files.
          Fields are assumed to be separated by "|".
        * **feature_normalize** (bool) --- whether to normalize the data with a
          preset mean and std.
        * **feature_normalize_mean** (float) --- used for feature normalization.
          Defaults to 0.
        * **feature_normalize_std** (float) --- used for feature normalization.
          Defaults to 1.
        * **mag_power** (int) --- the power to which the magnitude spectrogram is
          scaled: 1 for energy spectrogram, 2 for power spectrogram. Defaults
          to 2.
        * **pad_EOS** (bool) --- whether to apply EOS tokens to both the text and
          the speech signal. Will pad at least 1 token regardless of pad_to value.
          Defaults to True.
        * **pad_to** (int) --- we pad such that the resulting datapoint is a
          multiple of pad_to. Defaults to 8.
        """
        super(Text2SpeechDataLayer, self).__init__(params, model, num_workers,
                                                   worker_id)
        # Character level vocab
        self.params['char2idx'] = load_pre_existing_vocabulary(
            self.params['vocab_file'],
            read_chars=True,
        )
        self.params['idx2char'] = {
            i: w
            for w, i in self.params['char2idx'].items()
        }
        # add one for implied blank token
        self.params['src_vocab_size'] = len(self.params['char2idx']) + 1

        # This assumes that the LJSpeech dataset is used
        if "mel" in self.params["output_type"]:
            self._mel = True
            self._mel_basis = librosa.filters.mel(
                sr=22050, n_fft=1024, n_mels=self.params['num_audio_features'])
        else:
            self._mel = False

        # The rest of the code is not needed for interactive infer
        if self.params["interactive"]:
            return

        if "dataset_files" not in self.params:
            raise ValueError("dataset_files parameter has to be specified")

        names = ['wav_filename', 'transcript', 'transcript_normalized']

        if "disk" in self.params["output_type"]:
            self._load_from_disk = True
        else:
            self._load_from_disk = False

        # Load csv files
        self._files = None
        for csvs in params['dataset_files']:
            files = pd.read_csv(csvs,
                                encoding='utf-8',
                                sep='\x7c',
                                header=None,
                                names=names,
                                quoting=3)
            if self._files is None:
                self._files = files
            else:
                # DataFrame.append was removed in pandas 2.0; use concat
                self._files = pd.concat([self._files, files])

        if self.params['mode'] != 'infer':
            cols = ['wav_filename', 'transcript_normalized']
        else:
            cols = 'transcript_normalized'

        all_files = self._files.loc[:, cols].values
        self._files = self.split_data(all_files)

        self._size = self.get_size_in_samples()
        self._dataset = None
        self._iterator = None
        self._input_tensors = None
Example #9
  def __init__(self, params, model, num_workers=1, worker_id=0):
    super(ParallelTextDataLayer, self).__init__(params, model,
                                                num_workers, worker_id)
    self._batch_size = self.params['batch_size']
    self.source_file = self.params['source_file']
    self._use_targets = self.params.get('use_targets', True)
    if not self._use_targets:
      self.target_file = self.source_file
      if 'target_file' in self.params:
        print("WARNING: target file was specified but was "
              "ignored by data layer because 'use_targets'=False")
    else:
      self.target_file = self.params['target_file']
    self.src_vocab_file = self.params['src_vocab_file']
    self.tgt_vocab_file = self.params['tgt_vocab_file']
    self.max_len = self.params['max_length']
    self._delimiter = self.params.get('delimiter', ' ')
    self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
    self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False)
    self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', 4)
    self._num_workers = num_workers
    self._worker_id = worker_id
    if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0):
      raise ValueError("If padding to 8 in data layer, then "
                       "max_length should be multiple of 8")

    def file_len(fname):
      """Count lines in a file (0 for an empty file)."""
      with open(fname, encoding="utf-8") as f:
        return sum(1 for _ in f)

    self.dataset_size = file_len(self.source_file)

    # load source and target vocabularies to RAM
    self.src_seq2idx = load_pre_existing_vocabulary(
      self.src_vocab_file,
      min_idx=SpecialTextTokens.UNK_ID.value + 1)
    self.tgt_seq2idx = load_pre_existing_vocabulary(
      self.tgt_vocab_file,
      min_idx=SpecialTextTokens.UNK_ID.value + 1)

    # unknown symbol
    self.src_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.UNK_ID.value)] = \
      SpecialTextTokens.UNK_ID.value
    self.tgt_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.UNK_ID.value)] = \
      SpecialTextTokens.UNK_ID.value

    # sentence start
    self.src_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.S_ID.value)] = \
      SpecialTextTokens.S_ID.value
    self.tgt_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.S_ID.value)] = \
      SpecialTextTokens.S_ID.value
    # sentence end
    self.src_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.EOS_ID.value)] = \
      SpecialTextTokens.EOS_ID.value
    self.tgt_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.EOS_ID.value)] = \
      SpecialTextTokens.EOS_ID.value
    # padding
    self.src_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.PAD_ID.value)] = \
      SpecialTextTokens.PAD_ID.value
    self.tgt_seq2idx[
      SpecialTextTokens.to_string(SpecialTextTokens.PAD_ID.value)] = \
      SpecialTextTokens.PAD_ID.value

    if self.params.get('pad_vocab_to_eight', False):
      self.src_seq2idx = pad_vocab_to_eight(self.src_seq2idx)
      self.tgt_seq2idx = pad_vocab_to_eight(self.tgt_seq2idx)

    self.src_idx2seq = {idx: w for w, idx in self.src_seq2idx.items()}
    self.tgt_idx2seq = {idx: w for w, idx in self.tgt_seq2idx.items()}

    self.params['src_vocab_size'] = len(self.src_seq2idx)
    self.params['tgt_vocab_size'] = len(self.tgt_seq2idx)
    self.params['target_seq2idx'] = self.tgt_seq2idx
    self.params['source_seq2idx'] = self.src_seq2idx
    self.params['target_idx2seq'] = self.tgt_idx2seq
    self.params['source_idx2seq'] = self.src_idx2seq

    self._input_tensors = {}
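
Both ParallelTextDataLayer examples depend on a SpecialTextTokens enum that is not shown. Judging only from its usage above (min_idx=SpecialTextTokens.UNK_ID.value + 1 reserves the first four ids), it plausibly looks like the sketch below; the concrete id values and string forms are assumptions:

from enum import Enum

class SpecialTextTokens(Enum):
  PAD_ID = 0  # padding
  EOS_ID = 1  # sentence end
  S_ID = 2    # sentence start
  UNK_ID = 3  # unknown symbol

  @staticmethod
  def to_string(s_id):
    strings = {
        SpecialTextTokens.PAD_ID.value: '<PAD>',
        SpecialTextTokens.EOS_ID.value: '</S>',
        SpecialTextTokens.S_ID.value: '<S>',
        SpecialTextTokens.UNK_ID.value: '<UNK>',
    }
    return strings[s_id]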
Example #10
    def __init__(self, params, model, num_workers=None, worker_id=None):
        super(Text2SpeechDataLayer, self).__init__(params, model, num_workers,
                                                   worker_id)

        self.use_cache = self.params.get('use_cache', False)
        self._cache = {}

        names = ['mel_file', 'transcript', "fileid"]
        sep = '\x7c'  # the '|' character
        header = None

        # Character level vocab
        self.params['char2idx'] = load_pre_existing_vocabulary(
            self.params['vocab_file'],
            min_idx=3,
            read_chars=True,
        )
        # Add the pad, start, and end chars
        self.params['char2idx']['<p>'] = 0
        self.params['char2idx']['<s>'] = 1
        self.params['char2idx']['</s>'] = 2
        self.params['idx2char'] = {
            i: w
            for w, i in self.params['char2idx'].items()
        }
        self.params['src_vocab_size'] = len(self.params['char2idx'])

        # Load csv files
        self._files = None
        for csvs in params['dataset_files']:
            files = pd.read_csv(csvs,
                                encoding='utf-8',
                                sep=sep,
                                header=header,
                                names=names,
                                quoting=3)
            if self._files is None:
                self._files = files
            else:
                # DataFrame.append was removed in pandas 2.0; use concat
                self._files = pd.concat([self._files, files])

        if self.params['mode'] == 'train' and 'n_samples_train' in self.params:
            indices = self._files['transcript'].str.len().sort_values().index
            self._files = self._files.reindex(indices)

            n_samples = self.params.get('n_samples_train')
            print('Using just the {} shortest samples'.format(n_samples))
            self._files = self._files.iloc[:n_samples]

        if self.params['mode'] == 'eval':
            indices = self._files['transcript'].str.len().sort_values().index
            self._files = self._files.reindex(indices)

            if 'n_samples_eval' in self.params:
                n_samples = self.params['n_samples_eval']
                self._files = self._files.iloc[:n_samples]

        cols = ['mel_file', 'transcript', "fileid"]

        all_files = self._files.loc[:, cols].values
        all_files = [list(map(str, i)) for i in all_files]
        self._files = self.split_data(all_files)

        self._size = self.get_size_in_samples()
        self._dataset = None
        self._iterator = None
        self._input_tensors = None
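
The n_samples_train / n_samples_eval logic above keeps only the shortest transcripts. A toy demonstration of the reindex-then-slice pattern it uses (the data is made up):

import pandas as pd

df = pd.DataFrame({'transcript': ['a longer sentence', 'hi', 'medium one']})
indices = df['transcript'].str.len().sort_values().index
df = df.reindex(indices)  # rows now ordered shortest transcript first
df = df.iloc[:2]          # keep the 2 shortest samples
print(df['transcript'].tolist())  # ['hi', 'medium one']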