Example #1
def _process_utterance(out_dir, index, tar_cd_path, in_jd_path, in_cg_path):
    '''Preprocesses a single utterance (one target and two input audio files).

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    tar_cd_path: Path to the target audio file
    in_jd_path: Path to the first input audio file
    in_cg_path: Path to the second input audio file

  Returns:
    A (tar_cd_spectrogram_filename, tar_cd_mel_filename, n_frames,
    in_jd_mel_spectrogram_filename, in_cg_mel_spectrogram_filename) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    tar_cd_wav = audio.load_wav(tar_cd_path)

    # Compute the linear-scale spectrogram from the wav:
    tar_cd_spectrogram = audio.spectrogram(tar_cd_wav).astype(np.float32)
    n_frames = tar_cd_spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    tar_cd_mel_spectrogram = audio.melspectrogram(tar_cd_wav).astype(
        np.float32)

    in_jd_wav = audio.load_wav(in_jd_path)
    in_cg_wav = audio.load_wav(in_cg_path)

    # The input linear-scale spectrogram is not needed for voice-conversion
    # training, so this step is left commented out:
    #in_spectrogram = audio.spectrogram(in_cg_wav).astype(np.float32)

    # Compute the mel-scale spectrogram from the wav:
    in_jd_mel_spectrogram = audio.melspectrogram(in_jd_wav).astype(np.float32)
    in_cg_mel_spectrogram = audio.melspectrogram(in_cg_wav).astype(np.float32)

    # Write the spectrograms to disk:
    in_jd_mel_spectrogram_filename = 'Imuspeech-in_jd_mel_spec-%05d.npy' % index
    in_cg_mel_spectrogram_filename = 'Imuspeech-in_cg_mel_spec-%05d.npy' % index
    tar_cd_spectrogram_filename = 'Imuspeech-tar_cd_spec-%05d.npy' % index
    tar_cd_mel_filename = 'Imuspeech-tar_cd_mel-%05d.npy' % index

    np.save(os.path.join(out_dir, in_jd_mel_spectrogram_filename),
            in_jd_mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, in_cg_mel_spectrogram_filename),
            in_cg_mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tar_cd_spectrogram_filename),
            tar_cd_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tar_cd_mel_filename),
            tar_cd_mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (tar_cd_spectrogram_filename, tar_cd_mel_filename, n_frames,
            in_jd_mel_spectrogram_filename, in_cg_mel_spectrogram_filename)
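A hypothetical call site for this three-wav variant; the parallel file lists below are assumptions, not part of the original snippet:

# Sketch only: tar_cd_files, in_jd_files and in_cg_files are assumed to be
# parallel lists of wav paths prepared by the caller.
metadata = []
for i, (tar_cd, in_jd, in_cg) in enumerate(zip(tar_cd_files, in_jd_files, in_cg_files)):
    metadata.append(_process_utterance(out_dir, i, tar_cd, in_jd, in_cg))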
Example #2
File: wav2wav.py  Project: meppe/tacotron
def _process_utterance(out_dir, index, src_path, tgt_path):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    src_path: Path to the source audio file
    tgt_path: Path to the target audio file

  Returns:
    A (tgt_spectrogram_filename, tgt_mel_filename, n_frames, src_spectrogram_filename) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    src_wav = audio.load_wav(src_path)
    tgt_wav = audio.load_wav(tgt_path)

    # Compute the linear-scale spectrogram from the wav:
    src_spectrogram = audio.spectrogram(
        src_wav,
        num_src_freq=hparams.num_src_freq,
        frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
    src_n_frames = src_spectrogram.shape[1]
    tgt_spectrogram = audio.spectrogram(tgt_wav).astype(np.float32)
    tgt_n_frames = tgt_spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    src_mel_spectrogram = audio.melspectrogram(src_wav).astype(np.float32)
    tgt_mel_spectrogram = audio.melspectrogram(tgt_wav).astype(np.float32)

    # Write the spectrograms to disk:
    src_spectrogram_filename = 'wav2wav_src-spec-%05d.npy' % index
    src_mel_filename = 'wav2wav_src-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, src_spectrogram_filename),
            src_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, src_mel_filename),
            src_mel_spectrogram.T,
            allow_pickle=False)

    tgt_spectrogram_filename = 'wav2wav_tgt-spec-%05d.npy' % index
    tgt_mel_filename = 'wav2wav_tgt-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, tgt_spectrogram_filename),
            tgt_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tgt_mel_filename),
            tgt_mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (tgt_spectrogram_filename, tgt_mel_filename, tgt_n_frames,
            src_spectrogram_filename)
Example #3
def _process_utterance(out_dir, index, wav_path_neutral, wav_path_happy):
    '''Preprocesses a single neutral/happy utterance pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path_neutral: Path to the neutral-style audio file
    wav_path_happy: Path to the happy-style audio file

    Returns:
    A (spectrogram_neutral_filename, mel_neutral_filename, spectrogram_happy_filename,
    mel_happy_filename, n_frames) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav1 = audio.load_wav(wav_path_neutral)
    wav2 = audio.load_wav(wav_path_happy)

    # Compute the neutral linear-scale spectrogram from the wav:
    spectrogram_neutral = audio.spectrogram(wav1).astype(np.float32)
    n_frames = spectrogram_neutral.shape[1]
    # Compute a neutral mel-scale spectrogram from the wav:
    mel_spectrogram_neutral = audio.melspectrogram(wav1).astype(np.float32)

    # Compute the happy spectrograms (note: this overwrites n_frames, so the
    # returned frame count refers to the happy spectrogram):
    spectrogram_happy = audio.spectrogram(wav2).astype(np.float32)
    n_frames = spectrogram_happy.shape[1]
    mel_spectrogram_happy = audio.melspectrogram(wav2).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_neutral_filename = 'neutral-spec-%05d.npy' % index
    mel_neutral_filename = 'neutral-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_neutral_filename),
            spectrogram_neutral.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_neutral_filename),
            mel_spectrogram_neutral.T,
            allow_pickle=False)

    spectrogram_happy_filename = 'happy-spec-%05d.npy' % index
    mel_happy_filename = 'happy-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_happy_filename),
            spectrogram_happy.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_happy_filename),
            mel_spectrogram_happy.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_neutral_filename, mel_neutral_filename,
            spectrogram_happy_filename, mel_happy_filename, n_frames)
Example #4
def _process_utterance(out_dir, index, source_wav_path, target_wav_path):
    '''Preprocesses a single source/target utterance pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    source_wav_path: Path to the source audio file
    target_wav_path: Path to the target audio file

  Returns:
    A (source_mel_filename, n_frames, target_spectrogram_filename, target_mel_filename)
    tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    source_wav = audio.load_wav(source_wav_path)
    target_wav = audio.load_wav(target_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    target_spectrogram = audio.spectrogram(target_wav).astype(np.float32)
    n_frames = target_spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    source_mel_spectrogram = audio.melspectrogram(source_wav).astype(
        np.float32)
    target_mel_spectrogram = audio.melspectrogram(target_wav).astype(
        np.float32)

    # Write the spectrograms to disk:
    #source_spectrogram_filename = 'source-spec-%05d.npy' % index
    source_mel_filename = 'source-mel-%05d.npy' % index
    target_spectrogram_filename = 'target-spec-%05d.npy' % index
    target_mel_filename = 'target-mel-%05d.npy' % index
    #np.save(os.path.join(out_dir, source_spectrogram_filename), source_spectrogram.T, allow_pickle=False)

    np.save(os.path.join(out_dir, source_mel_filename),
            source_mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, target_spectrogram_filename),
            target_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, target_mel_filename),
            target_mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (source_mel_filename, n_frames, target_spectrogram_filename,
            target_mel_filename)
Example #5
def run_eval(args):
  #print(hparams_debug_string())
  is_teacher_force = False
  reference_mel = None

  synth = Synthesizer(teacher_forcing_generating=is_teacher_force)
  synth.load(args.model, args.reference)
  base_path = get_output_base_path(args.model)

  if args.reference is not None:
    ref_wav = audio.load_wav(args.reference)
    reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    #path = '%s_ref-%s.wav' % (base_path, os.path.splitext(os.path.basename(args.reference))[0])
    path = 'ref-%s.wav' % (os.path.splitext(os.path.basename(args.reference))[0])
  else:
    raise ValueError("You must set the reference audio.")

  with open('examples_test.txt', 'r') as fs:
    lines = fs.readlines()
    for i, line in enumerate(lines):
      args.text = line.strip().split('|')[-1]
      path_id = '%d_' % (i + 6)
      new_path = path_id + path
      print('Synthesizing: %s' % args.text)
      print('Output wav file: %s' % new_path)
      with open(new_path, 'wb') as f:
        f.write(synth.synthesize(args.text, reference_mel=reference_mel))
Example #6
def _process_utterance(out_dir,
                       index,
                       wav_path,
                       labels_path,
                       text,
                       person_id=1):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    end = int(end_offset *
              hparams.sample_rate) if end_offset is not None else -1
    wav = wav[start:end]
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text, person_id)
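Example #6 depends on a _parse_labels helper that is not shown. A minimal sketch of what it might look like, assuming label files where each line is "<start_time> <end_time> <symbol>" (times in seconds) and a leading/trailing sil symbol marks silence; the file format is an assumption:

def _parse_labels(path):
    # Hypothetical parser: collect (start_time, symbol) per line, then trim a
    # leading/trailing silence segment if present.
    labels = []
    with open(path) as f:
        for line in f:
            parts = line.strip().split(' ')
            if len(parts) >= 3:
                labels.append((float(parts[0]), parts[2]))
    start_offset, end_offset = 0.0, None
    if len(labels) > 1 and labels[0][1] == 'sil':
        start_offset = labels[1][0]  # speech starts where the first silence ends
    if len(labels) > 1 and labels[-1][1] == 'sil':
        end_offset = labels[-1][0]   # speech ends where the trailing silence begins
    return start_offset, end_offset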
Example #7
def _process_utterance(out_dir, index, wav_path, text):
  '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

  # Load the audio to a numpy array:
  wav = audio.load_wav(wav_path)

  # Compute the linear-scale spectrogram from the wav:
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]

  # Compute a mel-scale spectrogram from the wav:
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

  # Write the spectrograms to disk:
  spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
  mel_filename = 'ljspeech-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

  # Return a tuple describing this training example:
  return (spectrogram_filename, mel_filename, n_frames, text)
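These _process_utterance variants are typically driven by a preprocessing script that fans the work out over a process pool and writes the returned tuples to train.txt. A minimal sketch of such a driver, assuming an LJSpeech-style metadata.csv with pipe-separated fields (the metadata layout and column order are assumptions):

import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path(in_dir, out_dir, num_workers=4):
    # Submit one _process_utterance job per metadata row.
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, parts[2])))
    results = [future.result() for future in futures]
    # Some variants return None to reject an utterance (e.g. too long):
    return [r for r in results if r is not None]

def write_metadata(metadata, out_dir):
    # One pipe-separated line per utterance, in the tuple order returned above.
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')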
Example #8
def process_utterance(out_path, index, wav_path, text):
    '''
    Generates linear- and mel-scale spectrograms for a (text, wav) pair
    and saves the numpy arrays to disk.

    Returns the filenames of the saved arrays, the frame count, and the text.
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index

    # .T: transpose of the ndarray
    # allow_pickle=False: disallow pickled objects for security and portability
    np.save(os.path.join(out_path, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_path, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #9
def run_eval(args):
  print(hparams_debug_string())
  is_teacher_force = False
  mel_targets = args.mel_targets
  reference_mel = None
  if args.mel_targets is not None:
    is_teacher_force = True
    mel_targets = np.load(args.mel_targets)
  synth = Synthesizer(teacher_forcing_generating=is_teacher_force)
  synth.load(args.checkpoint, args.reference_audio)
  base_path = get_output_base_path(args.checkpoint)

  if args.reference_audio is not None:
    ref_wav = audio.load_wav(args.reference_audio)
    reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    path = '%s_ref-%s.wav' % (base_path, os.path.splitext(os.path.basename(args.reference_audio))[0])
  else:
    if hparams.use_gst:
      print("*******************************")
      print("TODO: add style weights when there is no reference audio. Now we use random weights, " + 
             "which may generate unintelligible audio sometimes.")
      print("*******************************")
      path = '%s_ref-randomWeight.wav' % (base_path)
    else:
      raise ValueError("You must set the reference audio if you don't want to use GSTs.")

  with open(path, 'wb') as f:
    print('Synthesizing: %s' % args.text)
    print('Output wav file: %s' % path)
    f.write(synth.synthesize(args.text, reference_mel=reference_mel))
Example #10
def _process_utterance(out_dir, prompt_id, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim leading and trailing silence:
    margin = int(hparams.sample_rate * 0.1)
    wav = wav[margin:-margin]
    wav, _ = librosa.effects.trim(wav,
                                  top_db=40,
                                  frame_length=1024,
                                  hop_length=256)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'amy-spec-%s.npy' % prompt_id
    mel_filename = 'amy-mel-%s.npy' % prompt_id
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #11
def _process_utterance(out_dir, index, wav_path, pinyin):
  '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of Chinese spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
  '''

  # Load the audio to a numpy array:
  wav = audio.load_wav(wav_path)

  # Compute the linear-scale spectrogram from the wav:
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]

  # Compute a mel-scale spectrogram from the wav:
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

  # Write the spectrograms to disk:
  spectrogram_filename = 'femalemandarin-spec-%05d.npy' % index
  mel_filename = 'femalemandarin-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

  # Return a tuple describing this training example:
  return (spectrogram_filename, mel_filename, n_frames, pinyin)
Example #12
    def synthesize(self,
                   path_in,
                   path_re,
                   mel_targets=None,
                   reference_mel=None,
                   alignment_path=None):
        wav_in = audio.load_wav(path_in)
        wav_re = audio.load_wav(path_re)
        mel_in = audio.melspectrogram(wav_in).astype(np.float32)
        mel_re = audio.melspectrogram(wav_re).astype(np.float32)
        # print(mel_jp)
        feed_dict = {
            self.model.inputs: [mel_in.T],
            self.model.input_lengths: np.asarray([len(mel_in)],
                                                 dtype=np.int32),
            self.model.inputs_jp: [mel_re.T],
        }
        # if mel_targets is not None:
        #   mel_targets = np.expand_dims(mel_targets, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
        # if reference_mel is not None:
        #   reference_mel = np.expand_dims(reference_mel, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})

        wav_out, alignments = self.session.run(
            [self.wav_output, self.alignments], feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav_out)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")  # current timestamp
        randomNum = random.randint(0, 100)  # random integer n, with 0 <= n <= 100
        if randomNum <= 10:
            randomNum = str(0) + str(randomNum)  # zero-pad to two digits
        uniqueNum = str(nowTime) + str(randomNum)
        out_dir = "static\\out\\" + uniqueNum + ".wav"
        out_name = uniqueNum + ".wav"

        audio.save_wav(wav, out_dir)
        out = io.BytesIO()
        audio.save_wav(wav, out)
        # n_frame = int(end_point / (hparams.frame_shift_ms / 1000* hparams.sample_rate)) + 1
        # plot.plot_alignment(alignments[:,:n_frame], alignment_path, info='%s' % (path))
        return out_dir, out_name
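The static\out\ output path and the returned (out_dir, out_name) pair suggest this synthesize method backs a small web app. A hypothetical Flask handler around it; the route, form fields, upload paths, and the synth wiring are all assumptions:

from flask import Flask, request, send_file

app = Flask(__name__)
synth = Synthesizer()  # assumed: constructed and loaded as in the run_eval examples

@app.route('/convert', methods=['POST'])
def convert():
    # Save the uploaded input and reference wavs, run conversion, return the wav.
    path_in = 'static/in/input.wav'
    path_re = 'static/in/reference.wav'
    request.files['input'].save(path_in)
    request.files['reference'].save(path_re)
    out_path, out_name = synth.synthesize(path_in, path_re)
    return send_file(out_path, as_attachment=True)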
Example #13
def convert_file(audio_path):
    y = audio.load_wav(audio_path)
    peak = np.abs(y).max()
    if hp.peak_norm or peak > 1.0:
        y *= (0.9 / peak)

    linear = audio.spectrogram(y)
    mel = audio.melspectrogram(y)
    return mel.astype(np.float32), linear.astype(np.float32)
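A possible way to use convert_file, mirroring the .npy save pattern of the other snippets; the file names are illustrative:

mel, linear = convert_file('sample.wav')
np.save('sample-mel.npy', mel.T, allow_pickle=False)        # (frames, n_mels)
np.save('sample-linear.npy', linear.T, allow_pickle=False)  # (frames, 1 + n_fft // 2)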
Example #14
def preprocess_utterance(wav_file, input_path, output_path):
    wav = audio.load_wav(wav_file)
    wav_path, name = os.path.split(wav_file)
    out_dir = wav_path.replace(input_path, output_path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_filename = name.replace('.wav', '.npy')
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    print(mel_filename, mel_spectrogram.shape[1])
Example #15
def _process_utterance(out_dir, index, wav_path, text):
  wav = audio.load_wav(wav_path)
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'selvas-spec-%04d.npy' % int(index)
  mel_filename = 'selvas-mel-%04d.npy' % int(index)
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)
Example #16
    def _extract_features_std(self, wav, text):
        wav_pre = audio.preemphasis(wav)
        linear_target = audio.spectrogram(wav_pre).astype(sp.float32)
        mel_target = audio.melspectrogram(wav_pre).astype(sp.float32)
        input_data = sp.asarray(text_to_sequence(str(text, encoding='utf8'),
                                                 self._cleaner_names),
                                dtype=sp.int32)
        input_length = sp.int32(len(input_data))
        return input_data, [input_length], mel_target.T, linear_target.T, [
            sp.int32(len(linear_target.T))
        ]
Example #17
def run_eval(args):
  print(hparams_debug_string())
  synth = Synthesizer()
  synth.load(args.checkpoint)
  base_path = get_output_base_path(args.checkpoint)
  wav = load_wav(args.reference_audio)
  mel = melspectrogram(wav).transpose()
  for i, text in enumerate(sentences):
    path = '%s-%d.wav' % (base_path, i)
    print('Synthesizing: %s' % path)
    with open(path, 'wb') as f:
      f.write(synth.synthesize(text, mel))
Example #18
File: ks.py  Project: linlinsongyun/vc-tf
def _process_utterance(out_dir, index, wav_path):
    '''Preprocesses a single utterance audio file.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input

    Returns:
        A (spectrogram_filename, mel_filename, n_frames) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Cut or pad the wav to a fixed length (hparams.duration seconds):
    length = hparams.sample_rate * hparams.duration
    wav = librosa.util.fix_length(wav, length)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Computer mfcc
    # mfcc = audio.mfcc(wav).astype(np.float32)

    # Write the spectrograms to disk:
    wav_name = os.path.basename(wav_path)
    wav_name = wav_name.split('.')[0]
    spectrogram_filename = 'spec-%s.npy' % wav_name
    mel_filename = 'mel-%s.npy' % wav_name
    mfcc_filename = 'mfcc-%s.npy' % wav_name
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # np.save(
    #     os.path.join(out_dir, mfcc_filename),
    #     mfcc.T,
    #     allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames)
Example #19
def _process_utterance(out_dir, index, wav_path, text):
  wav, _ = audio.load_wav(wav_path)

  spectrogram = audio.spectrogram(wav).astype(np.float32)  # (1025, frame)
  n_frames = spectrogram.shape[1]

  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)  # (80, frame)

  spectrogram_filename = 'kss-spec-%05d.npy' % index
  mel_filename = 'kss-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)  # (frame, 1025)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)  # (frame, 80)

  return (spectrogram_filename, mel_filename, n_frames, text)
Example #20
def _process_utterance(out_dir, index, wav_path, text, person_id):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    #max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    #if len(wav) > max_samples:
    #    return None
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'arctic-spec-%05d.npy' % index
    mel_filename = 'arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text, person_id)
Example #21
def _process_utterance(out_dir, name, wav_path, text):
    wav = audio.load_wav(wav_path)
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'bznsyp-spec-%s.npy' % name
    mel_filename = 'bznsyp-mel-%s.npy' % name
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    #text = sentence_to_pinyin(text)
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #22
    def synthesize(self, input_path):
        s, sr = sf.read(input_path)
        spec = audio.melspectrogram(s).astype(np.float32).T

        feed_dict = {
            self.model.inputs: [np.asarray(spec, dtype=np.float32)],
            self.model.input_lengths: np.asarray([spec.shape[0]],
                                                 dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
Example #23
def __generate_spectrograms(file_path, category, index, out_dir):
    wav = audio.load_wav(file_path)
    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    # Write the spectrograms to disk:
    spectrogram_filename = '{}spec{}.npy'.format(category, index)
    mel_filename = '{}mel{}.npy'.format(category, index)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
Example #24
def get_wav_linear_and_mel_targert(wav_path, set_spec_length=None):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    # Return a tuple describing this training example:
    if set_spec_length is not None:
        return (spectrogram.T[:set_spec_length],
                mel_spectrogram.T[:set_spec_length], n_frames)
    #wav = wav.reshape(-1, 1)
    #wav = np.pad(wav, [[2048, 0], [0, 0]], 'constant')
    #wav = np.pad(wav, [[2048, 0]], 'constant')
    return (wav, spectrogram.T, mel_spectrogram.T, n_frames)
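A possible call illustrating the two return shapes; the wav path and frame cap are illustrative:

# With a frame cap: truncated (frames, bins) spectrograms plus n_frames.
linear_t, mel_t, n_frames = get_wav_linear_and_mel_targert('sample.wav', set_spec_length=400)

# Without a cap: the raw wav samples are returned as well.
wav, linear_t, mel_t, n_frames = get_wav_linear_and_mel_targert('sample.wav')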
Example #25
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    print('wav_path:', wav_path)
    wav = audio.load_wav(wav_path)
    print('wav shape:', wav.shape)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    #print('spectrogram: ', spectrogram, '\nspectrogram,shape: ', spectrogram.shape)
    n_frames = spectrogram.shape[1]
    print('n_frames : ', n_frames)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    #print('melspectrogram: ', mel_spectrogram, '\nspectrogram,shape: ', mel_spectrogram.shape)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    print('spectrogram_filename:', spectrogram_filename)
    print('mel_filename:', mel_filename)
    print('out_dir: ', out_dir)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #26
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name):
    '''Evaluate model during training.
    Supposes that model variables are averaged.
    '''
    start_time = time.time()
    y_hat, y_target, loss, input_mel, upsampled_features = sess.run([model.tower_y_hat[0], model.tower_y_target[0],
                                                                     model.eval_loss, model.tower_eval_c[0],
                                                                     model.tower_eval_upsampled_local_features[0]])
    duration = time.time() - start_time
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format(
        len(y_target), duration, len(y_target) / duration))

    # Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

    # Save figure
    util.waveplot(plot_path, y_hat, y_target, model._hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))
    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

    # Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
    # Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
    T2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                       if hparams.symmetric_mels else (0, hparams.max_abs_value))
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(generated_mel, mel_path,
                          title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
                              global_step, loss), target_spectrogram=input_mel.T)
    util.plot_spectrogram(upsampled_features.T, upsampled_path,
                          title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
                              global_step, loss), auto_aspect=True)

    # Save Audio
    save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
    save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)

    # Write eval summary to tensorboard
    log('Writing eval summary!')
    add_test_stats(summary_writer, global_step, loss, hparams=hparams)
Example #27
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)
Example #28
def _process_utterance(wav_path, text, id):
    '''Preprocesses a single utterance audio/text pair.
    Unlike the variants above, this writes nothing to disk; it returns the raw
    wav and both spectrograms directly.
    Args:
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      id: Speaker identity
    Returns:
      A (wav, spectrogram, mel_spectrogram, text, id) tuple
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32).T
    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    return wav, spectrogram, mel_spectrogram, text, id
Example #29
def run_eval(args):
    print(hparams_debug_string())
    reference_mel = None
    synth = Synthesizer()
    synth.load(args.checkpoint, args.reference_audio)

    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T

    base_path = get_output_base_path(args.checkpoint)

    for i, text in enumerate(sentences):
        path = '%s_%d_%.1f_%d.wav' % (base_path + '_gst', hparams.gst_index,
                                      hparams.gst_scale, i)
        print('Synthesizing: %s' % path)
        with open(path, 'wb') as f:
            f.write(synth.synthesize(text, reference_mel=reference_mel))
Example #30
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    # wav = audio.load_wav(wav_path)
    y, sr = librosa.load(wav_path, sr=hparams.sample_rate)

    # Trim leading and trailing silence (the top_db threshold may need tuning):
    wav = librosa.effects.trim(y, top_db=45)[0]

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #31
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
    log('\nSaving intermediate states at step {}'.format(global_step))
    idx = 0
    y_hat, y, loss, length, input_pml_features, upsampled_features = sess.run([model.tower_y_hat_log[0][idx],
                                                                      model.tower_y_log[0][idx],
                                                                      model.loss,
                                                                      model.tower_input_lengths[0][idx],
                                                                      model.tower_c[0][idx],
                                                                      model.tower_upsampled_local_features[0][idx]])

    # mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

    # Save figure
    util.waveplot(plot_path, y_hat, y, hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))

    # Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
    # Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
    T2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                       if hparams.symmetric_mels else (0, hparams.max_abs_value))
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(generated_mel, mel_path,
                          title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
                              global_step, loss), target_spectrogram=input_pml_features.T)
    util.plot_spectrogram(upsampled_features.T, upsampled_path,
                          title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
                              global_step, loss), auto_aspect=True)

    # Save audio
    save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
    save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
Example #32
def _process_utterance(out_dir, index, wav_path, pinyin):
    wav = audio.load_wav(wav_path)

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frame = spectrogram.shape[1]
    if n_frame > hp.max_frame_num:
        return None

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    mel_filename = 'thchs30-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frame, pinyin)
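All of these preprocessors save spectrograms transposed to (frames, bins) and record n_frames in train.txt, which makes length-based batching straightforward at training time. A minimal, hypothetical loader that zero-pads a batch of saved mels to a common length:

import os
import numpy as np

def load_mel_batch(out_dir, mel_filenames):
    # Load (frames, n_mels) arrays and zero-pad each to the longest in the batch.
    mels = [np.load(os.path.join(out_dir, name)) for name in mel_filenames]
    lengths = np.asarray([m.shape[0] for m in mels], dtype=np.int32)
    max_len = int(lengths.max())
    batch = np.stack([
        np.pad(m, ((0, max_len - m.shape[0]), (0, 0)), mode='constant')
        for m in mels
    ])
    return batch, lengths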