def predict(hparams, model_dir, postnet_model_dir, test_source_files, test_target_files):
    audio = Audio(hparams)

    def predict_input_fn():
        source = tf.data.TFRecordDataset(list(test_source_files))
        target = tf.data.TFRecordDataset(list(test_target_files))
        dataset = DatasetSource(source, target, hparams)
        batched = dataset.prepare_and_zip().filter_by_max_output_length().group_by_batch(batch_size=1)
        return batched.dataset

    # First stage: the seq2seq model predicts mel spectrograms.
    estimator = SingleSpeakerTacotronV1Model(hparams, model_dir)

    predictions = map(
        lambda p: PredictedMel(p["id"], p["mel"], p["mel"].shape[1], p["mel"].shape[0],
                               p["alignment"], p["source"], p["text"]),
        estimator.predict(predict_input_fn))

    def predict_postnet_input_fn():
        # Feed the first-stage predictions back in as a dataset for the post-net.
        prediction_dataset = tf.data.Dataset.from_generator(
            lambda: predictions,
            output_types=PredictedMel(tf.int64, tf.float32, tf.int64, tf.int64,
                                      tf.float32, tf.int64, tf.string))
        target = tf.data.TFRecordDataset(list(test_target_files))
        dataset = PostNetDatasetSource(target, hparams)
        batched = dataset.create_source_and_target().filter_by_max_output_length().combine_with_prediction(
            prediction_dataset).expand_batch_dim()
        return batched.dataset

    # Second stage: the post-net turns predicted mels into audio, written as wav files.
    postnet_estimator = TacotronV1PostNetModel(hparams, audio, postnet_model_dir)

    for v in postnet_estimator.predict(predict_postnet_input_fn):
        filename = f"{v['id']}.wav"
        filepath = os.path.join(postnet_model_dir, filename)
        audio.save_wav(v["audio"], filepath)
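# A minimal usage sketch for predict(), not part of the original module: it
# assumes a corpus object such as the Blizzard2012 class below and that
# `hparams` has already been loaded elsewhere. The directory paths are
# placeholders.
#
# corpus = Blizzard2012(in_dir="/data/blizzard2012", out_dir="/data/preprocessed")
# predict(hparams,
#         model_dir="/models/tacotron",
#         postnet_model_dir="/models/postnet",
#         test_source_files=corpus.test_source_files,
#         test_target_files=corpus.test_target_files)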
def train_and_evaluate(hparams, model_dir, train_target_files, eval_target_files):
    audio = Audio(hparams)

    def train_input_fn():
        shuffled_train_target_files = list(train_target_files)
        shuffle(shuffled_train_target_files)
        target = tf.data.TFRecordDataset(shuffled_train_target_files)
        dataset = PostNetDatasetSource(target, hparams)
        batched = dataset.create_source_and_target().filter_by_max_output_length().repeat().shuffle(
            hparams.suffle_buffer_size).group_by_batch()
        return batched.dataset

    def eval_input_fn():
        shuffled_eval_target_files = list(eval_target_files)
        shuffle(shuffled_eval_target_files)
        target = tf.data.TFRecordDataset(shuffled_eval_target_files)
        dataset = PostNetDatasetSource(target, hparams)
        dataset = dataset.create_source_and_target().filter_by_max_output_length().repeat().group_by_batch(
            batch_size=1)
        return dataset.dataset

    run_config = tf.estimator.RunConfig(save_summary_steps=hparams.save_summary_steps,
                                        log_step_count_steps=hparams.log_step_count_steps)
    estimator = TacotronV1PostNetModel(hparams, audio, model_dir, config=run_config)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=hparams.num_evaluation_steps,
                                      throttle_secs=hparams.eval_throttle_secs,
                                      start_delay_secs=hparams.eval_start_delay_secs)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
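# A hedged usage sketch for train_and_evaluate(), assuming the same corpus
# object and a loaded `hparams`; the directory paths are placeholders, not
# part of this module.
#
# corpus = Blizzard2012(in_dir="/data/blizzard2012", out_dir="/data/preprocessed")
# train_and_evaluate(hparams,
#                    model_dir="/models/postnet",
#                    train_target_files=corpus.training_target_files,
#                    eval_target_files=corpus.validation_target_files)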
class VCTK:
    def __init__(self, in_dir, out_dir, hparams, speaker_info_filename='speaker-info.txt'):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)
        self.g2p = Flite(hparams.flite_binary_path,
                         hparams.phoneset_path) if hparams.phoneme == 'flite' else None
        self.speaker_info_filename = speaker_info_filename

    def list_files(self):
        # These transcripts are absent from the corpus even though audio exists.
        missing = ["s5_052.txt", "s5_219.txt"]

        def wav_files(speaker_info: SpeakerInfo):
            # Speaker 5 is the only speaker stored under an "s" prefix; all others use "p".
            wav_dir = os.path.join(self.in_dir, f"wav48/s{speaker_info.id}") if speaker_info.id == 5 \
                else os.path.join(self.in_dir, f"wav48/p{speaker_info.id}")
            return [os.path.join(wav_dir, wav_file)
                    for wav_file in sorted(os.listdir(wav_dir))
                    if wav_file.endswith('_mic2.flac')]

        def text_files(speaker_info: SpeakerInfo):
            txt_dir = os.path.join(self.in_dir, f"txt/s{speaker_info.id}") if speaker_info.id == 5 \
                else os.path.join(self.in_dir, f"txt/p{speaker_info.id}")
            return [os.path.join(txt_dir, txt_file)
                    for txt_file in sorted(os.listdir(txt_dir))
                    if txt_file.endswith('.txt') and os.path.basename(txt_file) not in missing]

        def text_and_wav_records(file_pairs, speaker_info):
            def create_record(txt_f, wav_f, speaker_info):
                key1 = os.path.basename(wav_f).replace("_mic2.flac", "")
                key2 = os.path.basename(txt_f).replace(".txt", "")
                assert key1 == key2, f"{key1} != {key2}"
                return TxtWavRecord(0, key1, txt_f, wav_f, speaker_info)

            return [create_record(txt_f, wav_f, speaker_info) for txt_f, wav_f in file_pairs]

        records = sum([text_and_wav_records(zip(text_files(si), wav_files(si)), si)
                       for si in self._load_speaker_info()], [])
        # Re-number all records with a global, zero-based id.
        return [TxtWavRecord(i, r.key, r.txt_path, r.wav_path, r.speaker_info)
                for i, r in enumerate(records)]

    def process_sources(self, rdd: RDD):
        return rdd.map(self._process_txt)

    def process_targets(self, rdd: RDD):
        return TargetRDD(rdd.map(self._process_wav).persist(StorageLevel.MEMORY_AND_DISK))

    def _load_speaker_info(self):
        with open(os.path.join(self.in_dir, self.speaker_info_filename),
                  mode='r', encoding='utf8') as f:
            for l in f.readlines()[1:]:  # skip the header row
                si = l.split()
                gender = 0 if si[2] == 'F' else 1
                if str(si[0][1:]) not in ["315", "362"]:  # FixMe: Why 315 is missing?
                    yield SpeakerInfo(int(si[0][1:]), int(si[1]), gender)

    def _process_wav(self, record: TxtWavRecord):
        wav = self.audio.load_wav(record.wav_path)
        wav = self.audio.trim(wav)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        file_path = os.path.join(self.out_dir, f"{record.key}.target.tfrecord")
        write_preprocessed_target_data(record.id, record.key, mel_spectrogram, file_path)
        # Per-utterance statistics, aggregated later to normalize the corpus.
        return MelStatistics(id=record.id,
                             key=record.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram), axis=0))

    def _process_txt(self, record: TxtWavRecord):
        with open(os.path.join(self.in_dir, record.txt_path), mode='r', encoding='utf8') as f:
            txt = f.readline().rstrip("\n")
            sequence, clean_text = text_to_sequence(txt, basic_cleaners)
            phone_ids, phone_txt = self.g2p.convert_to_phoneme(
                clean_text) if self.g2p is not None else (None, None)
            source = np.array(sequence, dtype=np.int64)
            phone_ids = np.array(phone_ids, dtype=np.int64) if phone_ids is not None else None
            file_path = os.path.join(self.out_dir, f"{record.key}.source.tfrecord")
            write_preprocessed_source_data(record.id, record.key, source, clean_text,
                                           phone_ids, phone_txt, record.speaker_info.id,
                                           record.speaker_info.age, record.speaker_info.gender,
                                           file_path)
            return record.key
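# Why _process_wav emits sum/moment2/length: the per-utterance statistics can
# be reduced to corpus-level mean and variance without holding spectrograms in
# memory. A self-contained numpy sketch of that reduction (the actual
# aggregation step lives elsewhere in the pipeline; this function is an
# illustrative assumption, not the project's reducer):
import numpy as np

def aggregate_mel_statistics(stats):
    # stats: iterable of MelStatistics-like records with .sum, .moment2, .length
    stats = list(stats)
    total_length = sum(s.length for s in stats)
    total_sum = np.sum([s.sum for s in stats], axis=0)
    total_moment2 = np.sum([s.moment2 for s in stats], axis=0)
    mean = total_sum / total_length
    # Var[X] = E[X^2] - (E[X])^2, computed per mel channel.
    variance = total_moment2 / total_length - np.square(mean)
    return mean, variance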
class Synthesize:
    def __init__(self, in_dir, out_dir, hparams, gender, speakerID,
                 speaker_info_filename='speaker-info.txt'):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)
        self.g2p = Flite(hparams.flite_binary_path,
                         hparams.phoneset_path) if hparams.phoneme == 'flite' else None
        self.speaker_info_filename = speaker_info_filename
        self.gender = gender
        self.speakerID = speakerID

    def list_files(self):
        def text_files(speaker_info: SpeakerInfo):
            txt_dir = self.in_dir
            return [os.path.join(txt_dir, txt_file)
                    for txt_file in sorted(os.listdir(txt_dir))
                    if txt_file.endswith('.txt')]

        def text_and_wav_records(file_pairs, speaker_info):
            def create_record(txt_f, wav_f, speaker_info):
                # There is no real audio at synthesis time; derive a placeholder
                # wav path from the text file name.
                wav_f = txt_f.split('.')[0] + '.wav'
                key1 = os.path.basename(wav_f).replace(".wav", "")
                key2 = os.path.basename(txt_f).replace(".txt", "")
                assert key1 == key2, f"{key1} != {key2}"
                return TxtWavRecord(0, key1, txt_f, wav_f, speaker_info)

            return [create_record(txt_f, wav_f, speaker_info) for txt_f, wav_f in file_pairs]

        records = sum([text_and_wav_records(zip(text_files(si), text_files(si)), si)
                       for si in self._load_speaker_info()], [])
        return [TxtWavRecord(i, r.key, r.txt_path, r.wav_path, r.speaker_info)
                for i, r in enumerate(records)]

    def process_sources(self, rdd: RDD):
        return rdd.map(self._process_txt)

    def process_targets(self, rdd: RDD):
        return TargetRDD(rdd.map(self._process_wav).persist(StorageLevel.MEMORY_AND_DISK))

    def _load_speaker_info(self):
        # The age field is filled with 100, arbitrarily, since it is not
        # currently used anywhere.
        gender = 0 if self.gender == 'F' else 1
        yield SpeakerInfo(int(self.speakerID[1:]), 100, gender)

    def _process_wav(self, record: TxtWavRecord):
        # Dummy audio: one second of silence at 48 kHz.
        wav = np.zeros(48000, dtype=np.float32)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        file_path = os.path.join(self.out_dir, f"{record.key}.target.tfrecord")
        write_preprocessed_target_data(record.id, record.key, mel_spectrogram, file_path)
        return MelStatistics(id=record.id,
                             key=record.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram), axis=0))

    def _process_txt(self, record: TxtWavRecord):
        with open(os.path.join(self.in_dir, record.txt_path), mode='r', encoding='utf8') as f:
            txt = f.readline().rstrip("\n")
            sequence, clean_text = text_to_sequence(txt, basic_cleaners)
            phone_ids, phone_txt = self.g2p.convert_to_phoneme(
                clean_text) if self.g2p is not None else (None, None)
            source = np.array(sequence, dtype=np.int64)
            phone_ids = np.array(phone_ids, dtype=np.int64) if phone_ids is not None else None
            file_path = os.path.join(self.out_dir, f"{record.key}.source.tfrecord")
            write_preprocessed_source_data(record.id, record.key, source, clean_text,
                                           phone_ids, phone_txt, record.speaker_info.id,
                                           record.speaker_info.age, record.speaker_info.gender,
                                           file_path)
            return record.key
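# A hedged usage sketch for Synthesize, assuming a directory of .txt prompts
# and a loaded `hparams`; the speaker id format ("p225") mirrors VCTK. The
# paths and the Spark driver wiring are assumptions, not part of this module:
#
# corpus = Synthesize(in_dir="/data/prompts", out_dir="/data/preprocessed",
#                     hparams=hparams, gender='F', speakerID='p225')
# records = corpus.list_files()
# rdd = sc.parallelize(records)          # sc: an existing SparkContext
# keys = corpus.process_sources(rdd).collect()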
class Blizzard2012(Corpus):
    def __init__(self, in_dir, out_dir):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.books = [
            'ATrampAbroad',
            'TheManThatCorruptedHadleyburg',
            'LifeOnTheMississippi',
            'TheAdventuresOfTomSawyer',
        ]
        self._end_buffer = 0.05
        self._min_confidence = 90
        self.audio = Audio(hparams)

    @property
    def training_source_files(self):
        return [os.path.join(self.out_dir, f"blizzard2012-source-{record_id:05d}.tfrecord")
                for record_id in range(321, 23204)]

    @property
    def training_target_files(self):
        return [os.path.join(self.out_dir, f"blizzard2012-target-{record_id:05d}.tfrecord")
                for record_id in range(321, 23204)]

    @property
    def validation_source_files(self):
        return [os.path.join(self.out_dir, f"blizzard2012-source-{record_id:05d}.tfrecord")
                for record_id in range(11, 321)]

    @property
    def validation_target_files(self):
        return [os.path.join(self.out_dir, f"blizzard2012-target-{record_id:05d}.tfrecord")
                for record_id in range(11, 321)]

    @property
    def test_source_files(self):
        return [os.path.join(self.out_dir, f"blizzard2012-source-{record_id:05d}.tfrecord")
                for record_id in range(1, 11)]

    @property
    def test_target_files(self):
        return [os.path.join(self.out_dir, f"blizzard2012-target-{record_id:05d}.tfrecord")
                for record_id in range(1, 11)]

    def text_and_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_path())

    def process_targets(self, rdd: RDD):
        return rdd.mapValues(self._process_target)

    def process_sources(self, rdd: RDD):
        return rdd.mapValues(self._process_source)

    def aggregate_source_metadata(self, rdd: RDD):
        def map_fn(splitIndex, iterator):
            # Fold each partition into (tsv text, max text length, record count).
            csv, max_len, count = reduce(
                lambda acc, kv: ("\n".join([acc[0], source_metadata_to_tsv(kv[1])]),
                                 max(acc[1], len(kv[1].text)),
                                 acc[2] + 1),
                iterator, ("", 0, 0))
            filename = f"blizzard2012-source-metadata-{splitIndex:03d}.tsv"
            filepath = os.path.join(self.out_dir, filename)
            with open(filepath, mode="w") as f:
                f.write(csv)
            yield count, max_len

        return rdd.sortByKey().mapPartitionsWithIndex(
            map_fn, preservesPartitioning=True).fold(
            (0, 0), lambda acc, xy: (acc[0] + xy[0], max(acc[1], xy[1])))

    def aggregate_target_metadata(self, rdd: RDD):
        def map_fn(splitIndex, iterator):
            csv, max_len, count = reduce(
                lambda acc, kv: ("\n".join([acc[0], target_metadata_to_tsv(kv[1])]),
                                 max(acc[1], kv[1].n_frames),
                                 acc[2] + 1),
                iterator, ("", 0, 0))
            filename = f"blizzard2012-target-metadata-{splitIndex:03d}.tsv"
            filepath = os.path.join(self.out_dir, filename)
            with open(filepath, mode="w") as f:
                f.write(csv)
            yield count, max_len

        return rdd.sortByKey().mapPartitionsWithIndex(
            map_fn, preservesPartitioning=True).fold(
            (0, 0), lambda acc, xy: (acc[0] + xy[0], max(acc[1], xy[1])))

    def _extract_text_and_path(self, book, line, index):
        parts = line.strip().split('\t')
        # Skip comment lines and low-confidence alignments.
        if line[0] != '#' and len(parts) == 8 and float(parts[3]) > self._min_confidence:
            wav_path = os.path.join(self.in_dir, book, 'wav', '%s.wav' % parts[0])
            labels_path = os.path.join(self.in_dir, book, 'lab', '%s.lab' % parts[0])
            text = parts[5]
            return TextAndPath(index, wav_path, labels_path, text)

    def _extract_all_text_and_path(self):
        index = 1
        for book in self.books:
            with open(os.path.join(self.in_dir, book, 'sentence_index.txt'), mode='r') as f:
                for line in f:
                    extracted = self._extract_text_and_path(book, line, index)
                    if extracted is not None:
                        yield (index, extracted)
                        index += 1

    def _load_labels(self, path):
        labels = []
        with open(os.path.join(path)) as f:
            for line in f:
                parts = line.strip().split(' ')
                if len(parts) >= 3:
                    labels.append((float(parts[0]), ' '.join(parts[2:])))
        # Use leading/trailing "sil" labels to find the speech boundaries.
        start = 0
        end = None
        if labels[0][1] == 'sil':
            start = labels[0][0]
        if labels[-1][1] == 'sil':
            end = labels[-2][0] + self._end_buffer
        return (start, end)

    def _text_to_sequence(self, text):
        # Character-level encoding: raw Unicode code points plus an eos marker.
        sequence = [ord(c) for c in text] + [eos]
        sequence = np.array(sequence, dtype=np.int64)
        return sequence

    def _process_target(self, paths: TextAndPath):
        wav = self.audio.load_wav(paths.wav_path)
        start_offset, end_offset = self._load_labels(paths.labels_path)
        start = int(start_offset * hparams.sample_rate)
        end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
        wav = wav[start:end]
        spectrogram = self.audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32)
        filename = f"blizzard2012-target-{paths.id:05d}.tfrecord"
        filepath = os.path.join(self.out_dir, filename)
        tfrecord.write_preprocessed_target_data(paths.id, spectrogram.T, mel_spectrogram.T, filepath)
        return TargetMetaData(paths.id, filepath, n_frames)

    def _process_source(self, paths: TextAndPath):
        sequence = self._text_to_sequence(paths.text)
        filename = f"blizzard2012-source-{paths.id:05d}.tfrecord"
        filepath = os.path.join(self.out_dir, filename)
        tfrecord.write_preprocessed_source_data2(paths.id, paths.text, sequence,
                                                 paths.text, sequence, filepath)
        return SourceMetaData(paths.id, filepath, paths.text)
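# The encoding used by Blizzard2012._text_to_sequence is just raw Unicode code
# points plus an end-of-sequence id, so it is trivially invertible. A
# self-contained round-trip sketch (eos = 1 is a guess; the real value is
# defined by the project's text module):
import numpy as np

eos = 1  # assumed end-of-sequence id

def char_text_to_sequence(text):
    return np.array([ord(c) for c in text] + [eos], dtype=np.int64)

def sequence_to_char_text(sequence):
    # Drop the trailing eos and map code points back to characters.
    return ''.join(chr(c) for c in sequence[:-1])

assert sequence_to_char_text(char_text_to_sequence("Hello.")) == "Hello."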