def augmentation(arpabet_dict, audio_paths, target_spk_id_list, output_path, ljs=False):
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Step 1: Basic setup
    if not ljs:  # whether to use the LJ Speech checkpoint
        checkpoint_path = "mellotron_libritts.pt"
    else:
        checkpoint_path = "mellotron_ljs.pt"
    if torch.cuda.is_available():
        tacotron = load_model(hparams).cuda().eval()
    else:
        tacotron = load_model(hparams).eval()
    tacotron.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")['state_dict'])

    waveglow_path = 'waveglow_256channels_v4.pt'
    if torch.cuda.is_available():
        waveglow = torch.load(waveglow_path)['model'].cuda().eval()
        denoiser = Denoiser(waveglow).cuda().eval()
    else:
        waveglow = torch.load(waveglow_path, map_location="cpu")['model'].eval().cpu()
        denoiser = Denoiser(waveglow).eval()

    arpabet_dict = cmudict.CMUDict(arpabet_dict)
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    # Step 2: Load source utterances. Open the scp file once; the original reopened
    # it in "w" mode on every iteration, truncating it each time.
    source_scp = open(os.path.join(output_path, "source.scp"), "w", encoding="utf-8")
    for file_idx in range(len(dataloader)):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
        source_scp.write("{} {}\n".format(file_idx, audio_path))

        # get audio path, encoded text, pitch contour and mel for GST
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :]
        pitch_contour = dataloader[file_idx][3][None]
        if torch.cuda.is_available():
            text_encoded = text_encoded.cuda()
            pitch_contour = pitch_contour.cuda()
        mel = load_mel(audio_path)

        # load source data to obtain rhythm, using Tacotron 2 as a forced aligner
        x, y = tacotron.parse_batch(datacollate([dataloader[file_idx]]))

        # Step 3: Perform speaker transfer
        with torch.no_grad():
            # get rhythm (alignment map) using Tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = tacotron.forward(x)
            rhythm = rhythm.permute(1, 0, 2)

        for spk_id in target_spk_id_list:
            speaker_id = torch.LongTensor([spk_id])
            if torch.cuda.is_available():
                speaker_id = speaker_id.cuda()
            with torch.no_grad():
                mel_outputs, mel_outputs_postnet, gate_outputs, _ = tacotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * 0.4, rhythm))
            with torch.no_grad():
                audio = denoiser(
                    waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
            sf.write(
                os.path.join(output_path, "{}-{}.wav".format(file_idx, spk_id)),
                audio.detach().cpu().numpy().T,
                hparams.sampling_rate)
    source_scp.close()
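# Usage sketch for augmentation() above (not from the original source): the filelist
# path matches the examples used elsewhere in this repo, but the target speaker ids
# are hypothetical and must be valid ids in the checkpoint's speaker embedding table.
if __name__ == '__main__':
    augmentation(
        arpabet_dict='data/cmu_dictionary',
        audio_paths='data/examples_filelist.txt',
        target_spk_id_list=[0, 1, 2],  # hypothetical target speakers
        output_path='augmented')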
def inference(dirname, outdir, checkpoint_path, sentence_list, parallel=False):
    # Load Mellotron
    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

    # Load the vocoder
    vocoder = get_vocoder()

    # Load the audio filelist
    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = f'data/{dirname}.txt'
    dataloader = TextMelLoader(audio_paths, hparams, speaker_ids=speaker_id_map)
    os.makedirs(f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}', exist_ok=True)
    with open('data/VCTK/speaker-dict.json') as f:
        speakers = json.load(f)

    new_filelist = []
    t0 = time.time()
    cnt = 0
    for file_idx in range(len(dataloader)):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
        if not parallel:
            for sent_txt in sentence_list:
                text = sent_txt
                # get encoded text, pitch contour and mel for GST
                text_encoded = torch.LongTensor(
                    text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()
                pitch_contour = dataloader[file_idx][3][None].cuda()
                mel = load_mel(audio_path)
                print(audio_path, text)

                # speaker id
                # speaker_name = os.path.basename(audio_path).split('_')[1]
                # speaker_id = speakers.index(speaker_name)
                speaker_id = int(sid)
                # speaker_id_mapped = speaker_id_map[speaker_id]
                speaker_id = torch.LongTensor([speaker_id]).cuda()

                # Mellotron synthesis
                with torch.no_grad():
                    mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference(
                        (text_encoded, mel, speaker_id, pitch_contour))

                # Waveform synthesis
                text_save = text[:100]  # truncate long sentences for the filename
                sample_name = f'{os.path.splitext(os.path.basename(audio_path))[0]}-{text_save}.wav'
                vocoder_infer(
                    mel_outputs_postnet, vocoder,
                    f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}')
                new_filelist.append(
                    f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}\n')
                cnt += 1
        else:
            # get encoded text, pitch contour and mel for GST
            text_encoded = torch.LongTensor(
                text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()
            pitch_contour = dataloader[file_idx][3][None].cuda()
            mel = load_mel(audio_path)
            print(audio_path, text)

            # speaker id (mapped through speaker_id_map)
            # speaker_name = os.path.basename(audio_path).split('_')[1]
            # speaker_id = speakers.index(speaker_name)
            speaker_id = int(sid)
            speaker_id_mapped = speaker_id_map[speaker_id]
            speaker_id = torch.LongTensor([speaker_id_mapped]).cuda()

            # Mellotron synthesis
            with torch.no_grad():
                mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference(
                    (text_encoded, mel, speaker_id, pitch_contour))

            # Waveform synthesis
            text_save = text[:10]  # truncate long sentences for the filename
            sample_name = f'{os.path.splitext(os.path.basename(audio_path))[0]}-{text_save}.wav'
            vocoder_infer(
                mel_outputs_postnet, vocoder,
                f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}')
            new_filelist.append(
                f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}/{sample_name}\n')
            cnt += 1

    with open(f'{outdir}/{os.path.basename(checkpoint_path)}/{dirname}.txt', 'w') as f:
        f.writelines(new_filelist)
    t1 = time.time()
    print(f'Average inference time: {(t1 - t0) / cnt:.6f}')
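# Usage sketch for inference() above; all argument values are illustrative. With
# parallel=False, every utterance in data/<dirname>.txt is re-synthesized once per
# sentence in sentence_list, reusing the source utterance's pitch contour and mel.
if __name__ == '__main__':
    inference(
        dirname='vctk_test',                        # reads data/vctk_test.txt
        outdir='samples',
        checkpoint_path='models/checkpoint_50000',  # hypothetical checkpoint
        sentence_list=['The birch canoe slid on the smooth planks.'],
        parallel=False)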
hparams = create_hparams()
hparams.batch_size = 1
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)

# speaker = "fv02"
checkpoint_path = '/mnt/sdc1/pitchtron/grl_200224/checkpoint_291000'
f0s_meta_path = '/mnt/sdc1/pitchtron/single_init_200123/f0s_combined.txt'
# "models/pitchtron_libritts.pt"
pitchtron = load_model(hparams).cuda().eval()
pitchtron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

waveglow_path = '/home/admin/projects/pitchtron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()

arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_pfp_single_sample.txt'
test_set = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)
dataloader = DataLoader(test_set, num_workers=1, shuffle=False,
                        batch_size=hparams.batch_size, pin_memory=False,
                        drop_last=False, collate_fn=datacollate)
speaker_ids = TextMelLoader(
    "filelists/wav_less_than_12s_158_speakers_train.txt", hparams).speaker_ids
# speaker_id = torch.LongTensor([speaker_ids[speaker]]).cuda()

# Load mean f0
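# The augmentation/inference code above calls load_mel(), which this snippet never
# defines. A module-level sketch consistent with the helper defined inside train()
# further below, reusing the stft instance created here (requires librosa):
import librosa

def load_mel(path):
    # load audio at the target sampling rate and compute its mel spectrogram
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, hparams.sampling_rate))
    audio_norm = audio.unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)
    if torch.cuda.is_available():
        melspec = melspec.cuda()
    return melspec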
def __init__(self, coordinator, data_paths, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self.data_paths = data_paths
    self.data_path_to_id = {
        data_path: _id for _id, data_path in enumerate(data_paths)
    }

    prefixes_dict = {}
    offset_dict = {}
    for data_path in data_paths:
        prefixes = []
        with open(os.path.join(data_path, 'ids.train'), 'r') as fi:
            for line in fi:
                line = line.strip()
                if line:
                    prefixes.append(line)
        prefixes_dict[data_path] = prefixes
        offset_dict[data_path] = 0
    self._prefixes_dict = prefixes_dict
    self._offset_dict = offset_dict

    self._placeholders = [
        tf.placeholder(tf.float32, [None, None, hparams.num_labs], 'inputs'),
        tf.placeholder(tf.int32, [None], 'input_lengths'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
        tf.placeholder(tf.string, [None], 'prefixes'),
        tf.placeholder(tf.int32, [None], 'speaker_ids'),
        tf.placeholder(tf.int32, [None], 'target_lengths')
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(
        8,
        [tf.float32, tf.int32, tf.float32, tf.float32, tf.string, tf.int32, tf.int32],
        name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    (self.inputs, self.input_lengths, self.mel_targets, self.linear_targets,
     self.prefixes, self.speaker_ids, self.target_lengths) = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)
    self.prefixes.set_shape(self._placeholders[4].shape)
    self.speaker_ids.set_shape(self._placeholders[5].shape)
    self.target_lengths.set_shape(self._placeholders[6].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
        # Note: this variant never sets self._datadir, which the original referenced
        # here; assume the dictionary lives alongside the first data path.
        cmudict_path = os.path.join(data_paths[0], 'cmudict-0.7b')
        if not os.path.isfile(cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download '
                'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                % cmudict_path)
        self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None
import re

import numpy as np
import music21 as m21
import torch
import torch.nn.functional as F

from text import text_to_sequence, get_arpabet, cmudict

CMUDICT_PATH = "data/cmu_dictionary"
CMUDICT = cmudict.CMUDict(CMUDICT_PATH)
PHONEME2GRAPHEME = {
    'AA': ['a', 'o', 'ah'],
    'AE': ['a', 'e'],
    'AH': ['u', 'e', 'a', 'h', 'o'],
    'AO': ['o', 'u', 'au'],
    'AW': ['ou', 'ow'],
    'AX': ['a'],
    'AXR': ['er'],
    'AY': ['i'],
    'EH': ['e', 'ae'],
    'EY': ['a', 'ai', 'ei', 'e', 'y'],
    'IH': ['i', 'e', 'y'],
    'IX': ['e', 'i'],
    'IY': ['ea', 'ey', 'y', 'i'],
    'OW': ['oa', 'o'],
    'OY': ['oy'],
    'UH': ['oo'],
    'UW': ['oo', 'u', 'o'],
    'UX': ['u'],
    'B': ['b'],
    'CH': ['ch', 'tch'],
    'D': ['d', 'e', 'de'],
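# Quick sanity check of the objects above: CMUDict.lookup() returns the list of
# ARPAbet pronunciations for a word (or None if absent), and PHONEME2GRAPHEME maps
# each ARPAbet phoneme back to its candidate grapheme spellings. Illustrative
# session; actual output depends on the dictionary file at CMUDICT_PATH:
#   >>> CMUDICT.lookup('canoe')
#   ['K AH0 N UW1']
#   >>> PHONEME2GRAPHEME['UW']
#   ['oo', 'u', 'o']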
def __init__(self, coordinator, metadata_filename, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata, skipping rows that exceed the decoder's maximum length or
    # whose text fails to clean:
    self._datadir = os.path.dirname(metadata_filename)
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = []
        for line in f:
            sp = line.strip().split('|')
            if int(sp[2]) >= hparams.outputs_per_step * hparams.max_iters:
                continue
            try:
                text_to_sequence(sp[3], self._cleaner_names)
            except Exception:
                continue
            self._metadata.append(sp)
        # self._metadata = [line.strip().split('|') for line in f]
        hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000)
        log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, [None, None], 'inputs'),
        tf.placeholder(tf.int32, [None], 'input_lengths'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32],
                         name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
        cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
        if not os.path.isfile(cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download '
                'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                % cmudict_path)
        self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None
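# Typical use of the feeder above, as in the original Tacotron training script:
# construct it under a graph scope, then start its background feeding thread inside
# a session. The filelist path is illustrative, and start_in_session is assumed to
# be the feeder's thread-launching helper from that implementation.
coord = tf.train.Coordinator()
with tf.variable_scope('datafeeder'):
    feeder = DataFeeder(coord, 'training/train.txt', hparams)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feeder.start_in_session(sess)  # assumed helper; launches the enqueue thread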
def __init__(self, coordinator, metadata_filename_pos, metadata_filename_neg, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata from both the positive and the negative filelist:
    self._datadir = os.path.dirname(metadata_filename_pos)
    # self._datadir_neg = os.path.dirname(metadata_filename_neg)
    with open(metadata_filename_pos, encoding='utf-16') as f:
        self._metadata_pos = [line.strip().split('|') for line in f]
        hours = sum((int(x[2]) for x in self._metadata_pos)) * hparams.frame_shift_ms / (3600 * 1000)
        log('Loaded positive metadata for %d examples (%.2f hours)' % (len(self._metadata_pos), hours))
    with open(metadata_filename_neg, encoding='utf-16') as f:
        self._metadata_neg = [line.strip().split('|') for line in f]
        hours = sum((int(x[2]) for x in self._metadata_neg)) * hparams.frame_shift_ms / (3600 * 1000)
        log('Loaded negative metadata for %d examples (%.2f hours)' % (len(self._metadata_neg), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, [None, None], 'inputs_pos'),
        tf.placeholder(tf.int32, [None], 'input_lengths_pos'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets_pos'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets_pos'),
        tf.placeholder(tf.int32, [None, None], 'inputs_neg'),
        tf.placeholder(tf.int32, [None], 'input_lengths_neg'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets_neg'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets_neg'),
        tf.placeholder(tf.int32, [None, 4], 'pos_labels'),
        tf.placeholder(tf.int32, [None, 4], 'neg_labels')
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(16,
                         [tf.int32, tf.int32, tf.float32, tf.float32,
                          tf.int32, tf.int32, tf.float32, tf.float32,
                          tf.int32, tf.int32],
                         name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    # Dequeue all ten tensors (the original dropped the two label tensors, then
    # called set_shape on self.labels_pos, which was never defined).
    (self.inputs_pos, self.input_lengths_pos, self.mel_targets_pos, self.linear_targets_pos,
     self.inputs_neg, self.input_lengths_neg, self.mel_targets_neg, self.linear_targets_neg,
     self.labels_pos, self.labels_neg) = queue.dequeue()
    self.inputs_pos.set_shape(self._placeholders[0].shape)
    self.input_lengths_pos.set_shape(self._placeholders[1].shape)
    self.mel_targets_pos.set_shape(self._placeholders[2].shape)
    self.linear_targets_pos.set_shape(self._placeholders[3].shape)
    # The negative tensors must take their shapes from the negative placeholders
    # (indices 4-7), not from the positive ones as in the original.
    self.inputs_neg.set_shape(self._placeholders[4].shape)
    self.input_lengths_neg.set_shape(self._placeholders[5].shape)
    self.mel_targets_neg.set_shape(self._placeholders[6].shape)
    self.linear_targets_neg.set_shape(self._placeholders[7].shape)
    self.labels_pos.set_shape(self._placeholders[8].shape)
    self.labels_neg.set_shape(self._placeholders[9].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
        cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
        if not os.path.isfile(cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download cmu dictionary first. '
                'Run shell as:\n wget -P %s '
                'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b'
                % self._datadir)
        self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation, logging results to tensorboard and stdout.

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)
    # else:
    #     torch.cuda.set_device('cuda:1')

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    waveglow_path = 'waveglow_256channels_universal_v5.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().float()
    # waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()

    # ---------------------- MELLOTRON CODE BLOCK --------------------------
    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'data/examples_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(hparams.n_frames_per_step)
    file_idx = 0
    audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                        hparams.n_mel_channels, hparams.sampling_rate,
                        hparams.mel_fmin, hparams.mel_fmax)

    def load_mel(path):
        audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
        audio = torch.from_numpy(audio)
        if sampling_rate != hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, stft.sampling_rate))
        audio_norm = audio.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    # get audio path, encoded text, pitch contour and mel for GST
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()
    mel = load_mel(audio_path)
    print(audio_path, text)
    inference_batch = datacollate([dataloader[file_idx]])
    # ---------------------- MELLOTRON CODE BLOCK (END) --------------------------

    logger = prepare_directories_and_logger(output_directory, log_directory, rank)

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                validate(model, criterion, valset, iteration, hparams.batch_size,
                         n_gpus, collate_fn, logger, hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
                # if not is_overflow and (iteration % 2 == 0):
                log_audio(model, iteration, logger, waveglow, inference_batch,
                          text_encoded, mel)

            iteration += 1
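# Usage sketch: a minimal single-GPU invocation of train() above. All argument
# values are illustrative; group_name only matters for distributed runs.
if __name__ == '__main__':
    hparams = create_hparams()
    train(output_directory='outdir', log_directory='logdir', checkpoint_path=None,
          warm_start=False, n_gpus=1, rank=0, group_name='group_name',
          hparams=hparams)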
def __init__(self, coordinator, training_path, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0
    self._offset_person_id = 0
    self._batch_in_queue = 0
    self._datasets = hparams.datasets

    # Load metadata from every dataset, grouping utterances by a global person id:
    self._metadata = []
    global_person_id = 0
    for dataset in self._datasets:
        metadata_filename = os.path.join(training_path, dataset, 'train.txt')
        datadir = os.path.dirname(metadata_filename)
        # exist_person_id maps this dataset's local person ids to global person ids
        exist_person_id = {}
        with open(metadata_filename, encoding='utf-8') as f:
            metadata = [line.strip().split('|') for line in f]
            hours = sum((int(x[2]) for x in metadata)) * hparams.frame_shift_ms / (3600 * 1000)
            log('Loaded ' + dataset + ' metadata for %d examples (%.2f hours)'
                % (len(metadata), hours))
            for item in metadata:
                # item = ['vctk-spec-23918.npy', 'vctk-mel-23918.npy', 329,
                #         'They say that vital evidence was not heard in court.', 60]
                person_id = item[4]
                item[0] = os.path.join(datadir, item[0])
                item[1] = os.path.join(datadir, item[1])
                if person_id not in exist_person_id:
                    exist_person_id[person_id] = global_person_id
                    global_person_id += 1
                    self._metadata.append([])
                global_person_id_crrt = exist_person_id[person_id]
                self._metadata[global_person_id_crrt].append(item)

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, [None, None], 'inputs'),
        tf.placeholder(tf.int32, [None], 'input_lengths'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(100, [tf.int32, tf.int32, tf.float32, tf.float32],
                         name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
        # Note: this variant never sets self._datadir, which the original referenced
        # here; assume the dictionary sits directly under training_path.
        cmudict_path = os.path.join(training_path, 'cmudict-0.7b')
        if not os.path.isfile(cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download '
                'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                % cmudict_path)
        self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None
def test_cmudict_no_keep_ambiguous():
    c = cmudict.CMUDict(io.StringIO(test_data), keep_ambiguous=False)
    assert len(c) == 5
    assert c.lookup('adversity') == ['AE0 D V ER1 S IH0 T IY2']
    assert c.lookup('adverse') is None
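# The test above assumes a module-level test_data string in CMUDict format. A
# fixture consistent with its assertions (five unambiguous words, 'adversity' with
# the expected pronunciation, and 'adverse' dropped because it has multiple
# pronunciations) might look like this; it is illustrative, not the original fixture:
test_data = '''\
ADVENTURE  AH0 D V EH1 N CH ER0
ADVERSE  AH0 D V ER1 S
ADVERSE(1)  AE1 D V ER2 S
ADVERSITY  AE0 D V ER1 S IH0 T IY2
ADVERT  AH0 D V ER1 T
ADVICE  AH0 D V AY1 S
ADVISE  AH0 D V AY1 Z
'''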
def __init__(self, coordinator, metadata_filename, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self.train_offset = 0
    self.test_offset = 0

    # Load metadata:
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
    self._datadir = os.path.dirname(metadata_filename)
    self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum((int(x[4]) for x in self._metadata)) * frame_shift_ms / 3600
        log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Train/test split
    if hparams.gst_test_size is None:
        assert hparams.gst_test_batches is not None

    test_size = (hparams.gst_test_size if hparams.gst_test_size is not None
                 else hparams.gst_test_batches * hparams.batch_size)
    indices = np.arange(len(self._metadata))
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size, random_state=hparams.gst_data_random_state)

    # Make sure test_indices is a multiple of batch_size by rounding down and
    # pushing the leftover samples back into the training set. (The original called
    # _round_up here, which would leave extra_test empty and the test set
    # potentially unaligned with the batch size.)
    len_test_indices = _round_down(len(test_indices), hparams.batch_size)
    extra_test = test_indices[len_test_indices:]
    test_indices = test_indices[:len_test_indices]
    train_indices = np.concatenate([train_indices, extra_test])

    self._train_meta = list(np.array(self._metadata)[train_indices])
    self._test_meta = list(np.array(self._metadata)[test_indices])

    self.test_steps = len(self._test_meta) // hparams.batch_size

    if hparams.gst_test_size is None:
        assert hparams.gst_test_batches == self.test_steps

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, [None, None], 'inputs'),
        tf.placeholder(tf.int32, [None], 'input_lengths'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
        tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
        tf.placeholder(tf.int32, shape=(None,), name='targets_lengths'),
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(
        8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
        name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    (self.inputs, self.input_lengths, self.mel_targets, self.token_targets,
     self.linear_targets, self.targets_lengths) = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.token_targets.set_shape(self._placeholders[3].shape)
    self.linear_targets.set_shape(self._placeholders[4].shape)
    self.targets_lengths.set_shape(self._placeholders[5].shape)

    # Create eval queue for buffering eval data
    eval_queue = tf.FIFOQueue(
        1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
        name='eval_queue')
    self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
    (self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets,
     self.eval_token_targets, self.eval_linear_targets,
     self.eval_targets_lengths) = eval_queue.dequeue()
    self.eval_inputs.set_shape(self._placeholders[0].shape)
    self.eval_input_lengths.set_shape(self._placeholders[1].shape)
    self.eval_mel_targets.set_shape(self._placeholders[2].shape)
    self.eval_token_targets.set_shape(self._placeholders[3].shape)
    self.eval_linear_targets.set_shape(self._placeholders[4].shape)
    self.eval_targets_lengths.set_shape(self._placeholders[5].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
        cmudict_path = os.path.join(os.path.dirname(metadata_filename), 'cmudict-0.7b')
        if not os.path.isfile(cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download cmu dictionary first. '
                'Run shell as:\n wget -P %s '
                'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b'
                % self._datadir)
        self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None