def predict(hparams, model_dir, checkpoint_path, output_dir, test_files):
    audio = Audio(hparams)

    def predict_input_fn():
        records = tf.data.TFRecordDataset(list(test_files))
        dataset = DatasetSource(records, hparams)
        batched = dataset.make_source_and_target().group_by_batch(
            batch_size=1).arrange_for_prediction()
        return batched.dataset

    estimator = WaveNetModel(hparams, model_dir)

    predictions = map(
        lambda p: PredictedAudio(p["id"], p["key"], p["predicted_waveform"],
                                 p["ground_truth_waveform"], p["mel"], p["text"]),
        estimator.predict(predict_input_fn, checkpoint_path=checkpoint_path))

    for v in predictions:
        key = v.key.decode('utf-8')
        audio_filename = f"{key}.wav"
        audio_filepath = os.path.join(output_dir, audio_filename)
        tf.logging.info(f"Saving {audio_filepath}")
        audio.save_wav(v.predicted_waveform, audio_filepath)

        png_filename = f"{key}.png"
        png_filepath = os.path.join(output_dir, png_filename)
        tf.logging.info(f"Saving {png_filepath}")
        # ToDo: pass global step
        plot_wav(png_filepath, v.predicted_waveform, v.ground_truth_waveform,
                 key, 0, v.text.decode('utf-8'), hparams.sample_rate)
def main(args, hp):
    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args.checkpoint_path)['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args.embedder_path)
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        ref_wav, _ = librosa.load(args.reference_file, sr=16000)
        ref_mel = audio.get_mel(ref_wav)
        ref_mel = torch.from_numpy(ref_mel).float().cuda()
        dvec = embedder(ref_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
        mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
        mixed_mag = torch.from_numpy(mixed_mag).float().cuda()
        mixed_mag = mixed_mag.unsqueeze(0)

        shadow_mag = model(mixed_mag, dvec)

        # add the predicted "shadow" magnitude to the mixture while both are still
        # tensors on the same device, then move the result to numpy
        recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
        recorded_mag = recorded_mag[0].cpu().detach().numpy()
        # reconstruct with the mixture phase (the model only predicts magnitudes)
        recorded_wav = audio.spec2wav(recorded_mag, mixed_phase)

        os.makedirs(args.out_dir, exist_ok=True)
        out_path = os.path.join(args.out_dir, 'result.wav')
        librosa.output.write_wav(out_path, recorded_wav, sr=16000)
def __init__(self, model_path, out_dir, text_file, sil_file, use_griffin_lim, hparams):
    self.model_path = model_path
    self.out_dir = out_dir
    self.text_file = text_file
    self.sil_file = sil_file
    self.use_griffin_lim = use_griffin_lim
    self.hparams = hparams
    self.model = get_model(model_path, hparams)
    self.audio_class = Audio(hparams)

    if hparams.use_phone:
        from text.phones import Phones
        phone_class = Phones(hparams.phone_set_file)
        self.text_to_sequence = phone_class.text_to_sequence
    else:
        from text import text_to_sequence
        self.text_to_sequence = text_to_sequence

    # self.out_png_dir = os.path.join(self.out_dir, 'png')
    # os.makedirs(self.out_png_dir, exist_ok=True)
    self.out_wav_dir = os.path.join(self.out_dir, 'wav')
    os.makedirs(self.out_wav_dir, exist_ok=True)
def main(args, hp):
    model = VoiceFilter(hp).cuda()
    chkpt_model = torch.load(args.checkpoint_path)['model']
    model.load_state_dict(chkpt_model)
    model.eval()

    embedder = SpeechEmbedder(hp).cuda()
    chkpt_embed = torch.load(args.embedder_path)
    embedder.load_state_dict(chkpt_embed)
    embedder.eval()

    audio = Audio(hp)
    dvec_wav, _ = librosa.load(args.reference_file, sr=16000)
    dvec_mel = audio.get_mel(dvec_wav)
    dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
    dvec = embedder(dvec_mel)
    dvec = dvec.unsqueeze(0)

    mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
    mag, phase = audio.wav2spec(mixed_wav)
    mag = torch.from_numpy(mag).float().cuda()
    mag = mag.unsqueeze(0)

    mask = model(mag, dvec)
    est_mag = mag * mask

    est_mag = est_mag[0].cpu().detach().numpy()
    est_wav = audio.spec2wav(est_mag, phase)

    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, 'result.wav')
    librosa.output.write_wav(out_path, est_wav, sr=16000)
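# Usage sketch (assumption): a voicefilter-style main(args, hp) like the one above is
# typically driven by a small argparse wrapper such as the one below. The flag names
# are illustrative placeholders; HParam is assumed to be the same yaml-backed loader
# used elsewhere in these snippets.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run VoiceFilter inference on one mixture')
    parser.add_argument('-c', '--config', type=str, required=True, help='yaml hparam file')
    parser.add_argument('-e', '--embedder_path', type=str, required=True, help='speaker embedder checkpoint')
    parser.add_argument('--checkpoint_path', type=str, required=True, help='VoiceFilter checkpoint')
    parser.add_argument('-m', '--mixed_file', type=str, required=True, help='mixture wav')
    parser.add_argument('-r', '--reference_file', type=str, required=True, help='reference wav of the target speaker')
    parser.add_argument('-o', '--out_dir', type=str, required=True, help='output directory')
    args = parser.parse_args()

    hp = HParam(args.config)
    main(args, hp)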
def __init__(self, model_path, out_dir, text_file, sil_file, use_griffin_lim,
             gen_wavenet_fea, hparams):
    self.out_dir = out_dir
    self.text_file = text_file
    self.sil_file = sil_file
    self.use_griffin_lim = use_griffin_lim
    self.gen_wavenet_fea = gen_wavenet_fea
    self.hparams = hparams
    self.model = get_model(model_path, hparams)
    self.audio_class = Audio(hparams)

    if hparams.use_phone:
        from text.phones import Phones
        phone_class = Phones(hparams.phone_set_file)
        self.text_to_sequence = phone_class.text_to_sequence
    else:
        from text import text_to_sequence
        self.text_to_sequence = text_to_sequence

    if hparams.is_multi_speakers and not hparams.use_pretrained_spkemb:
        self.speaker_id_dict = gen_speaker_id_dict(hparams)

    self.out_png_dir = os.path.join(self.out_dir, 'png')
    os.makedirs(self.out_png_dir, exist_ok=True)
    if self.use_griffin_lim:
        self.out_wav_dir = os.path.join(self.out_dir, 'wav')
        os.makedirs(self.out_wav_dir, exist_ok=True)
    if self.gen_wavenet_fea:
        self.out_mel_dir = os.path.join(self.out_dir, 'mel')
        os.makedirs(self.out_mel_dir, exist_ok=True)
def __init__(self, hp, args, train):
    def find_all(file_format):
        # return sorted(glob.glob(os.path.join(self.data_dir, file_format)))
        return sorted(
            glob.glob(os.path.join(self.data_dir, '**', file_format),
                      recursive=True))

    self.hp = hp
    self.args = args
    self.train = train
    self.data_dir = hp.data.train_dir if train else hp.data.test_dir

    self.dvec_list = find_all(hp.form.dvec)
    self.target_wav_list = find_all(hp.form.target.wav)
    self.mixed_wav_list = find_all(hp.form.mixed.wav)
    self.target_mag_list = find_all(hp.form.target.mag)
    self.mixed_mag_list = find_all(hp.form.mixed.mag)

    assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
        len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
    assert len(self.dvec_list) != 0, \
        "no training file found"

    self.audio = Audio(hp)
def __init__(self, in_dir, out_dir, hparams, speaker_info_filename='speaker-info.txt'):
    self.in_dir = in_dir
    self.out_dir = out_dir
    self.speaker_info_filename = speaker_info_filename
    self.audio = Audio(hparams)
class VFDataset(Dataset):
    def __init__(self, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.train = train
        self.data_dir = config.data['base_dir'] + config.data['train_dir'] if train \
            else config.data['base_dir'] + config.data['test_dir']

        self.dvec_list = find_all(config.form['dvec'])
        self.target_wav_list = find_all(config.form['target']['wav'])
        self.mixed_wav_list = find_all(config.form['mixed']['wav'])
        self.target_mag_list = find_all(config.form['target']['mag'])
        self.mixed_mag_list = find_all(config.form['mixed']['mag'])

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio()

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(config.data['base_dir'] + dvec_path,
                                   sr=config.audio['sample_rate'])
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         config.audio['sample_rate'])
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        config.audio['sample_rate'])
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, config.audio['sample_rate'])
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
class VFDataset(Dataset):
    def __init__(self, hp, args, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.hp = hp
        self.args = args
        self.train = train
        self.data_dir = hp.data.train_dir if train else hp.data.test_dir

        self.dvec_list = find_all(hp.form.dvec)
        self.target_wav_list = find_all(hp.form.target.wav)
        self.mixed_wav_list = find_all(hp.form.mixed.wav)
        self.target_mag_list = find_all(hp.form.target.mag)
        self.mixed_mag_list = find_all(hp.form.mixed.mag)

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(dvec_path, sr=self.hp.audio.sample_rate)
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
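# Usage sketch (assumption): how a VFDataset like the one above might be wrapped in a
# DataLoader. Batch size and worker counts are placeholders; if the d-vector mels have
# variable length, a custom collate_fn would be needed instead of the default one.
from torch.utils.data import DataLoader

def make_loader_sketch(hp, args, train=True):
    dataset = VFDataset(hp, args, train)
    if train:
        # the default collate only works here if every returned tensor has a fixed shape
        return DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4, drop_last=True)
    # test items mix numpy arrays, tensors and raw wavs of different lengths,
    # so evaluate one item at a time
    return DataLoader(dataset, batch_size=1, shuffle=False)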
class SoundRecognitionApp():
    def __init__(self, cfg) -> None:
        self.transformer = Signal2ImageTransformer(**cfg['transforms'])
        self.audio = Audio(cfg['audio'])
        self.load_model(cfg['model'])

    def run(self):
        print("============= REALTIME START ==============")
        self.audio.start()
        self.flag = True
        try:
            while self.flag:
                status, data = self.audio.get()
                if status == Audio.ERROR:
                    print('[error]')
                    break
                elif status == Audio.WAIT:
                    continue
                mel_spec = self.preprocess(data)
                result = self.inference(mel_spec)
        except KeyboardInterrupt:
            pass
        except Exception as e:
            print(e)
        finally:
            self.audio.stop()
            print("============= REALTIME FINISH ==============")

    def preprocess(self, signal):
        return np.expand_dims(self.transformer.transform(signal), axis=0)

    def inference(self, X):
        image = torch.from_numpy(X.astype(np.float32)).clone()
        # .to() returns a new tensor; re-assign it, otherwise the input stays on the
        # CPU while the model lives on self.device
        image = image.to(self.device).float()
        prob = self.model(image)['multilabel_proba'].detach().cpu().numpy()
        return prob

    def load_model(self, cfg):
        try:
            self.device = torch.device(cfg["device"])
            self.model = getattr(ml.my_model, cfg['name'])(**cfg['params'])
            self.model.load_state_dict(torch.load(cfg['path']))
            self.model.to(self.device)
        except AttributeError as e:
            print(f"Model {cfg['name']} is None. {e}")
            exit(1)
        except FileNotFoundError as e:
            print(f"{e}")
            exit(1)
        except Exception as e:
            print(f"{e}")
            exit(1)
def crop_media(candidates, base_path, out_path='for_axlotl'):
    for session_id, session in candidates.items():
        result_top_path = os.path.join(out_path, session_id)
        if not os.path.isdir(result_top_path):
            os.makedirs(result_top_path)
        for text_path, contents in session.items():
            # the url element should always have a single file
            paths = create_local_paths(base_path,
                                       (text_path, '', contents['urls'][0][1]))
            result_basename = os.path.basename(paths['audio_path']).split('.')[0]
            result_text = os.path.join(result_top_path, result_basename) + '.txt'
            # if audio file exists
            if os.path.isfile(paths['audio_path']):
                if os.path.isfile(result_text):
                    msg = 'skipping. processed text file %s exists' % result_text
                    logging.info(msg)
                else:
                    # if there is only one speaker in the intervention
                    if len(contents['text']) == 1:
                        text = contents['text'][0][1]
                        full_text = ' '.join(tokenize(text)).lower()
                        clean_text = re.sub(token_clean, '', full_text)
                        audio_file = Audio(paths['audio_path'])
                        trimmer = Trimmer(clean_text, audio_file)
                        try:
                            start, end, start_word_i, end_word_i = \
                                trimmer.crop_longaudio()
                        except Exception as e:
                            print(e)
                            print((clean_text[:100], clean_text[-100:]))
                            raise ValueError()
                        print(text_path)
                        if start and end:
                            msg = '%s start and end matched for cropping' \
                                  % contents['text']
                            full_words = full_text.split()
                            new_text = ' '.join(full_words[start_word_i:end_word_i])
                            print(start, end, new_text[:100], new_text[-100:])
                            with open(result_text, 'w') as out:
                                out.write(new_text)
                            result_audio = audio_file.segment(start=start,
                                                              end=end + 0.2,
                                                              outpath=result_top_path)
                            print(result_audio, result_text)
                        else:
                            msg = '%s matching the start and end failed' \
                                  % contents['text']
def __init__(self, model: tf.keras.models.Model, log_dir: str, config: dict,
             max_plot_frequency=10, default_writer='log_dir'):
    self.model = model
    self.log_dir = Path(log_dir)
    self.config = config
    self.audio = Audio(config)
    self.plot_frequency = max_plot_frequency
    self.default_writer = default_writer
    self.writers = {}
    self.add_writer(tag=default_writer, path=self.log_dir, default=True)
class LJSpeech:
    def __init__(self, in_dir, out_dir, hparams):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)

    def text_and_audio_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_audio_path())

    def process_data(self, rdd: RDD):
        return rdd.mapValues(self._process_source_and_target)

    def _extract_text_and_path(self, line, index):
        parts = line.strip().split('|')
        key = parts[0]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % parts[0])
        text = parts[2]
        return TextAndAudioPath(index, key, wav_path, text)

    def _extract_all_text_and_audio_path(self):
        index = 1
        with open(os.path.join(self.in_dir, 'metadata.csv'), mode='r', encoding='utf-8') as f:
            for line in f:
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)
                    index += 1

    def _process_source_and_target(self, paths: TextAndAudioPath):
        wav = self.audio.load_wav(paths.wav_path)
        n_samples = len(wav)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        n_frames = mel_spectrogram.shape[0]
        filename = f"{paths.key}.tfrecord"
        filepath = os.path.join(self.out_dir, filename)
        tfrecord.write_preprocessed_data(paths.id, paths.key, wav, mel_spectrogram,
                                         paths.text, filepath)
        return SourceAndTargetMetaData(paths.id, paths.key, n_samples, n_frames, filepath)

    def _process_mel(self, paths: TextAndAudioPath):
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        sum_mel_powers = np.sum(mel_spectrogram, axis=1)
        n_frames = mel_spectrogram.shape[0]
        return MelMetaData(n_frames, sum_mel_powers)
class LJSpeech:
    def __init__(self, in_dir, mel_out_dir, wav_out_dir, hparams):
        self.in_dir = in_dir
        self.mel_out_dir = mel_out_dir
        self.wav_out_dir = wav_out_dir
        self.audio = Audio(hparams)

    @property
    def record_ids(self):
        return map(lambda v: str(v), range(1, 13101))

    def record_file_path(self, record_id, kind):
        assert kind in ["source", "target"]
        return os.path.join(self.mel_out_dir,
                            f"ljspeech-{kind}-{int(record_id):05d}.tfrecord")

    def text_and_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_path())

    def process_wav(self, rdd: RDD):
        return rdd.mapValues(self._process_wav)

    def _extract_text_and_path(self, line, index):
        parts = line.strip().split('|')
        key = parts[0]
        text = parts[2]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        return TextAndPath(index, key, wav_path, None, text)

    def _extract_all_text_and_path(self):
        with open(os.path.join(self.in_dir, 'metadata.csv'), mode='r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)

    def _process_wav(self, paths: TextAndPath):
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        mel_spectrogram = self.audio.normalize_mel(mel_spectrogram)
        mel_filepath = os.path.join(self.mel_out_dir, f"{paths.key}.mfbsp")
        wav_filepath = os.path.join(self.wav_out_dir, f"{paths.key}.wav")
        mel_spectrogram.tofile(mel_filepath, format="<f4")
        self.audio.save_wav(wav, wav_filepath)
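# Driver sketch (assumption): how the Spark-based LJSpeech preprocessor above might be
# invoked end to end. The SparkContext setup, directory paths and hparams object are
# placeholders, not values taken from this file.
from pyspark import SparkContext

def preprocess_ljspeech_sketch(in_dir, mel_out_dir, wav_out_dir, hparams):
    sc = SparkContext(appName='ljspeech-preprocess')
    instance = LJSpeech(in_dir, mel_out_dir, wav_out_dir, hparams)
    rdd = instance.text_and_path_rdd(sc)   # (index, TextAndPath) pairs
    instance.process_wav(rdd).collect()    # writes .mfbsp / .wav files as a side effect
    sc.stop()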
def main(args):
    args = {
        "config": 'config/config.yaml',
        "embedder_path": 'model/embedder.pt',
        "checkpoint_path": 'enhance_my_voice/chkpt_201000.pt',
        "mixed_file": 'utils/speakerA.wav',
        "reference_file": 'utils/speakerA.wav',
        "out_dir": 'output',
    }
    hp = HParam(args['config'])

    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args['checkpoint_path'])['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args['embedder_path'])
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        dvec_wav, _ = librosa.load(args['reference_file'], sr=16000)
        dvec_mel = audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
        dvec = embedder(dvec_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args['mixed_file'], sr=16000)
        mag, phase = audio.wav2spec(mixed_wav)
        mag = torch.from_numpy(mag).float().cuda()
        mag = mag.unsqueeze(0)

        mask = model(mag, dvec)
        est_mag = mag * mask

        est_mag = est_mag[0].cpu().detach().numpy()
        # est_wav = audio.spec2wav(est_mag, phase)
        # os.makedirs(args['out_dir'], exist_ok=True)
        # out_path = os.path.join(args['out_dir'], 'result.wav')
        # librosa.output.write_wav(out_path, est_wav, sr=16000)
        return audio.spec2wav(est_mag, phase)
def __init__(self, setting_path: str, audio_path: str):
    """Constructor.

    Args:
        setting_path (str): path to the settings (config) file
        audio_path (str): path to the audio file
    """
    with open(setting_path, 'r') as f:
        cfg = yaml.safe_load(f)  # explicit safe loader; a bare yaml.load() is deprecated
    self.cf = cf.ChangeFinder(**cfg['change_finder'])
    self.audio = Audio(cfg['audio'], audio_file_path=audio_path)
    self.buffer = np.zeros(cfg['model']['buffer_audio_length'], dtype=np.float32)
    self.buf_num = int(cfg['model']['frame_buf_num'])
    self.spec_buf = []
    self.thr = float(cfg['model']['thr'])
def __init__(self, hp, train):
    def find_all(data_dir, file_format):
        return sorted(glob.glob(os.path.join(data_dir, file_format)))

    self.hp = hp
    self.train = train
    self.mixed_dir = hp.data.vfws_dir + 'mixed/train/' if train else hp.data.vfws_dir + 'mixed/test/'
    self.clean_dir = hp.data.vfws_dir + 'clean/train/' if train else hp.data.vfws_dir + 'clean/test/'

    self.target_wav_list = find_all(self.clean_dir, hp.form.target.wav)
    self.mixed_wav_list = find_all(self.mixed_dir, hp.form.mixed.wav)
    self.target_mag_list = find_all(self.clean_dir, hp.form.target.mag)
    self.mixed_mag_list = find_all(self.mixed_dir, hp.form.mixed.mag)

    assert len(self.target_wav_list) == len(self.mixed_wav_list) == \
        len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"

    self.audio = Audio(hp)
def trainer(model_name):
    chkpt_path = None  #@param
    device = xm.xla_device()

    pt_dir = os.path.join('.', config.log['chkpt_dir'], model_name)
    os.makedirs(pt_dir, exist_ok=True)

    log_dir = os.path.join('.', config.log['log_dir'], model_name)
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(
                os.path.join(log_dir, '%s-%d.log' % (model_name, time.time()))),
            logging.StreamHandler()
        ])
    logger = logging.getLogger()
    writer = MyWriter(log_dir)

    trainloader = create_dataloader(train=True)
    testloader = create_dataloader(train=False)

    embedder_pt = torch.load('/drive/content/My Drive/ColabDisk/embedder_cpu.pt')
    embedder = SpeechEmbedder().to(device)
    embedder.load_state_dict(embedder_pt)
    embedder.eval()

    model = VoiceFilter().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.train['adam'])
    audio = Audio()

    starting_epoch = 1
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint_file = torch.load(chkpt_path)
        model.load_state_dict(checkpoint_file['model'])
        optimizer.load_state_dict(checkpoint_file['optimizer'])
        starting_epoch = checkpoint_file['epoch']
    else:
        logger.info("Starting new training run")

    for epoch in range(starting_epoch, config.train['epoch'] + 1):
        para_loader = pl.ParallelLoader(trainloader, [device]).per_device_loader(device)
        train(embedder, model, optimizer, para_loader, writer, logger, epoch, pt_dir, device)
        xm.master_print("Finished training epoch {}".format(epoch))

        logger.info("Starting to validate epoch...")
        para_loader = pl.ParallelLoader(testloader, [device]).per_device_loader(device)
        validate(audio, model, embedder, para_loader, writer, epoch, device)

    model_saver(model, optimizer, pt_dir, config.train['epoch'])
def __init__(self, train):
    def find_all(file_format):
        return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

    self.train = train
    self.data_dir = config.data['records_dir'] + config.data['train_dir'] if train \
        else config.data['records_dir'] + config.data['test_dir']

    self.dvec_list = find_all(config.form['dvec'])
    self.target_wav_list = find_all(config.form['target']['wav'])
    self.mixed_wav_list = find_all(config.form['mixed']['wav'])
    self.target_mag_list = find_all(config.form['target']['mag'])
    self.mixed_mag_list = find_all(config.form['mixed']['mag'])

    assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
        len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
    assert len(self.dvec_list) != 0, \
        "no training file found"

    self.audio = Audio()
def __init__(self, config: configparser, debug: bool):
    super().__init__(command_prefix=determine_prefix,
                     description="NerdyBot - Always one step ahead!")
    self.config = config
    self.debug = debug
    self.client_id = config["bot"]["client_id"]
    self.token = config["bot"]["token"]
    self.ops = config["bot"]["ops"]
    self.moderator_role = config["bot"]["moderator_role_name"]
    self.modules = json.loads(config["bot"]["modules"])
    self.restart = True

    self.log = self._get_logger()
    self.uptime = datetime.utcnow()
    self.audio = Audio(self)
    self.last_cmd_cache = {}
    self.usr_cmd_err_spam = {}
    self.usr_cmd__err_spam_threshold = int(config["bot"]["error_spam_threshold"])
    self.convMan = ConversationManager(self)

    # database variables
    if "database" not in config:
        self.log.error("No Database specified! Fallback to local SQLite Database!")
        db_connection_string = "sqlite:///db.db"
    else:
        database_config = config["database"]
        db_type = database_config["db_type"]
        db_name = database_config["db_name"]
        db_username = ""
        db_password = ""
        db_host = ""
        db_port = ""
        if any(s in db_type for s in ("mysql", "mariadb")):
            db_type = f'{database_config["db_type"]}+pymysql'
        if "db_password" in database_config and database_config["db_password"]:
            db_password = f':{database_config["db_password"]}'
        if "db_username" in database_config and database_config["db_username"]:
            db_username = database_config["db_username"]
        if "db_host" in database_config and database_config["db_host"]:
            db_host = f'@{database_config["db_host"]}'
        if "db_port" in database_config and database_config["db_port"]:
            db_port = f':{database_config["db_port"]}'
        db_authentication = f"{db_username}{db_password}{db_host}{db_port}"
        db_connection_string = f"{db_type}://{db_authentication}/{db_name}"

    self.ENGINE = create_engine(db_connection_string, echo=self.debug)
    self.SESSION = sessionmaker(bind=self.ENGINE, expire_on_commit=False)
    self.create_all()
    self._import_modules()
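# Sketch (assumption): the block above assembles a standard SQLAlchemy database URL of the
# form "{db_type}://{user}:{password}@{host}:{port}/{db_name}". The credentials below are
# illustrative placeholders only, not values from this file; create_engine does not connect
# until the engine is first used.
from sqlalchemy import create_engine

example_url = "mysql+pymysql://nerdy:secret@db.example.org:3306/nerdybot"
fallback_url = "sqlite:///db.db"  # the SQLite fallback used when no [database] section exists
engine = create_engine(example_url, echo=False)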
def main(args, hp):
    with open('out1.txt') as f:
        for line in f:
            res = line.split('\t')
            with torch.no_grad():
                model = VoiceFilter(hp)
                chkpt_model = torch.load(args.checkpoint_path, map_location='cpu')['model']
                model.load_state_dict(chkpt_model)
                model.eval()

                embedder = SpeechEmbedder(hp)
                chkpt_embed = torch.load(args.embedder_path, map_location='cpu')
                embedder.load_state_dict(chkpt_embed)
                embedder.eval()

                audio = Audio(hp)
                dvec_wav, _ = librosa.load(res[1], sr=16000)
                dvec_mel = audio.get_mel(dvec_wav)
                dvec_mel = torch.from_numpy(dvec_mel).float()
                dvec = embedder(dvec_mel)
                dvec = dvec.unsqueeze(0)

                mixed_wav, _ = librosa.load(res[0], sr=16000)
                mag, phase = audio.wav2spec(mixed_wav)
                mag = torch.from_numpy(mag).float()
                mag = mag.unsqueeze(0)

                mask = model(mag, dvec)
                est_mag = mag * mask

                est_mag = est_mag[0].cpu().detach().numpy()
                est_wav = audio.spec2wav(est_mag, phase)

                os.makedirs('/root/voicefilter/res', exist_ok=True)
                out_path = os.path.join('/root/voicefilter/res', f'{res[2]}')
                librosa.output.write_wav(out_path, est_wav, sr=16000)
def main(audio_filepath, text_filepath):
    text = get_text(text_filepath)
    token_clean = r'\.|,|;|:|\?|!|\.\.\.'  # raw string so the regex escapes survive
    tokenized_text = ' '.join(tokenize(text))
    clean_text = re.sub(token_clean, '', tokenized_text).lower()
    audio_file = Audio(audio_filepath)
    trimmer = Trimmer(clean_text, audio_file)
    start, end, start_word_index, end_word_index = trimmer.crop_longaudio()
    if start and end:
        if end_word_index is None:
            end_word_index = -1
        else:
            end_word_index += 1
        print(start, end,
              tokenized_text.split()[start_word_index],
              tokenized_text.split()[end_word_index])
def prop_media(candidates, base_path, out_path='for_axlotl'):
    axlotl_input = []
    for session_id, session in candidates.items():
        result_top_path = os.path.join(out_path, session_id)
        if not os.path.isdir(result_top_path):
            os.makedirs(result_top_path)
        for text_path, contents in session.items():
            # the url element should always have a single file
            paths = create_local_paths(base_path,
                                       (text_path, '', contents['urls'][0][1]))
            result_basename = os.path.basename(paths['audio_path']).split('.')[0]
            yaml_name = os.path.basename(text_path).split('.')[0]
            text_filename = '-'.join([session_id, yaml_name, result_basename]) + '.txt'
            result_text = os.path.join(result_top_path, text_filename)
            # if audio file exists
            if os.path.isfile(paths['audio_path']):
                if os.path.isfile(result_text):
                    msg = 'skipping. processed text file %s exists' % result_text
                    #logging.info(msg)
                else:
                    # if there is one or two speakers in the intervention
                    if len(contents['text']) < 3:
                        text = ' '.join([text[1] for text in contents['text']])
                        audio_file = Audio(paths['audio_path'])
                        wps = len(text.split()) / audio_file.duration * 60
                        # if wps reasonable accept as an axlotl input
                        if wps < 95. or wps > 195:
                            msg = '%s wps is not reasonable: %4.2f. skipping' \
                                  % (text_path, wps)
                            logging.warning(msg)
                        else:
                            with open(result_text, 'w') as out:
                                out.write(text)
                if os.path.isfile(result_text):
                    #logging.info('text, audio: %s,%s'%(result_text,
                    #                                   paths['audio_path']))
                    axlotl_input.append((result_text, paths['audio_path']))
    with open('axlotl_input.csv', 'w') as out:
        for text, audio in axlotl_input:
            out.write('%s,%s\n' % (os.path.abspath(text), os.path.abspath(audio)))
class VFWSDataset(Dataset):
    def __init__(self, hp, train):
        def find_all(data_dir, file_format):
            return sorted(glob.glob(os.path.join(data_dir, file_format)))

        self.hp = hp
        self.train = train
        self.mixed_dir = hp.data.vfws_dir + 'mixed/train/' if train else hp.data.vfws_dir + 'mixed/test/'
        self.clean_dir = hp.data.vfws_dir + 'clean/train/' if train else hp.data.vfws_dir + 'clean/test/'

        self.target_wav_list = find_all(self.clean_dir, hp.form.target.wav)
        self.mixed_wav_list = find_all(self.mixed_dir, hp.form.mixed.wav)
        self.target_mag_list = find_all(self.clean_dir, hp.form.target.mag)
        self.mixed_mag_list = find_all(self.mixed_dir, hp.form.mixed.mag)

        assert len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.target_mag_list)

    def __getitem__(self, idx):
        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
train_folders = [x for x in glob.glob(os.path.join(args.current_corpus_dir, 'train'))
                 if os.path.isdir(x)]
test_folders = [x for x in glob.glob(os.path.join(args.current_corpus_dir, 'test'))
                if os.path.isdir(x)]

# train_spk = all files in train_folders
train_spk = [glob.glob(os.path.join(spk, '**', hp.form.input), recursive=True)
             for spk in train_folders]
train_spk = [x for x in train_spk if len(x) >= 2]

# test_spk = all files in test_folders
test_spk = [glob.glob(os.path.join(spk, '**', hp.form.input), recursive=True)
            for spk in test_folders]
test_spk = [x for x in test_spk if len(x) >= 2]

audio = Audio(hp)

def train_wrapper(num):
    '''Randomly choose 2 speakers from the training set and mix them'''
    spk1, spk2 = random.sample(train_spk, 2)
    s1_dvec, s1_target = random.sample(spk1, 2)
    s2 = random.choice(spk2)
    mix(hp, args, audio, num, s1_dvec, s1_target, s2, train=True)

def test_wrapper(num):
    '''Randomly choose 2 speakers from the testing set and mix them'''
    spk1, spk2 = random.sample(test_spk, 2)
    s1_dvec, s1_target = random.sample(spk1, 2)
    s2 = random.choice(spk2)
    mix(hp, args, audio, num, s1_dvec, s1_target, s2, train=False)
ref_mel, eliminated_wav, mixed_wav, expected_hidden_wav, eliminated_mag, \
    expected_hidden_mag, mixed_mag, mixed_phase, dvec_path, eliminated_wav_path, \
    mixed_wav_path = batch[0]

# print("expected_focused: {}".format(expected_focused_wav_path))
print("Mixed: {}".format(mixed_wav_path))

model = VoiceFilter(hp).cuda()
chkpt_model = torch.load(args.checkpoint_path, map_location='cuda:0')['model']
model.load_state_dict(chkpt_model)
model.eval()

embedder = SpeechEmbedder(hp).cuda()
chkpt_embed = torch.load(args.embedder_path)
embedder.load_state_dict(chkpt_embed)
embedder.eval()

audio = Audio(hp)
dvec_wav, _ = librosa.load(dvec_path, sr=16000)
ref_mel = audio.get_mel(dvec_wav)
ref_mel = torch.from_numpy(ref_mel).float().cuda()
dvec = embedder(ref_mel)
dvec = dvec.unsqueeze(0)  # (1, 256)

mixed_wav, _ = librosa.load(mixed_wav_path, sr=16000)
mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
mixed_mag = torch.from_numpy(mixed_mag).float().cuda()
mixed_mag = mixed_mag.unsqueeze(0)

shadow_mag = model(mixed_mag, dvec)  # shadow_mag.size() = [1, 301, 601]
class VCTK:
    def __init__(self, in_dir, out_dir, hparams, speaker_info_filename='speaker-info.txt'):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.speaker_info_filename = speaker_info_filename
        self.audio = Audio(hparams)

    def list_files(self):
        def wav_files(speaker_info: SpeakerInfo):
            wav_dir = os.path.join(self.in_dir, f"wav48/p{speaker_info.id}")
            return [
                os.path.join(wav_dir, wav_file)
                for wav_file in sorted(os.listdir(wav_dir))
                if wav_file.endswith('.wav')
            ]

        def text_files(speaker_info: SpeakerInfo):
            txt_dir = os.path.join(self.in_dir, f"txt/p{speaker_info.id}")
            return [
                os.path.join(txt_dir, txt_file)
                for txt_file in sorted(os.listdir(txt_dir))
                if txt_file.endswith('.txt')
            ]

        def text_and_wav_records(file_pairs, speaker_info):
            def create_record(txt_f, wav_f, speaker_info):
                key1 = os.path.basename(wav_f).strip('.wav')
                key2 = os.path.basename(txt_f).strip('.txt')
                assert key1 == key2
                return TxtWavRecord(0, key1, txt_f, wav_f, speaker_info)

            return [
                create_record(txt_f, wav_f, speaker_info)
                for txt_f, wav_f in file_pairs
            ]

        records = sum([
            text_and_wav_records(zip(text_files(si), wav_files(si)), si)
            for si in self._load_speaker_info()
        ], [])
        return [
            TxtWavRecord(i, r.key, r.txt_path, r.wav_path, r.speaker_info)
            for i, r in enumerate(records)
        ]

    def process_sources(self, rdd: RDD):
        return rdd.map(self._process_txt)

    def process_targets(self, rdd: RDD):
        return TargetRDD(
            rdd.map(self._process_wav).persist(StorageLevel.MEMORY_AND_DISK))

    def _load_speaker_info(self):
        with open(os.path.join(self.in_dir, self.speaker_info_filename),
                  mode='r', encoding='utf8') as f:
            for l in f.readlines()[1:]:
                si = l.split()
                gender = 0 if si[2] == 'F' else 1
                if str(si[0]) != "315":  # FixMe: Why 315 is missing?
                    yield SpeakerInfo(int(si[0]), int(si[1]), gender)

    def _process_wav(self, record: TxtWavRecord):
        wav = self.audio.load_wav(record.wav_path)
        wav = self.audio.trim(wav)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        file_path = os.path.join(self.out_dir, f"{record.key}.target.tfrecord")
        write_preprocessed_target_data(record.id, record.key, mel_spectrogram, file_path)
        return MelStatistics(id=record.id,
                             key=record.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram), axis=0))

    def _process_txt(self, record: TxtWavRecord):
        with open(os.path.join(self.in_dir, record.txt_path), mode='r', encoding='utf8') as f:
            txt = f.readline().rstrip("\n")
            sequence, clean_text = text_to_sequence(txt, basic_cleaners)
            source = np.array(sequence, dtype=np.int64)
            file_path = os.path.join(self.out_dir, f"{record.key}.source.tfrecord")
            write_preprocessed_source_data(record.id, record.key, source, clean_text,
                                           record.speaker_info.id, record.speaker_info.age,
                                           record.speaker_info.gender, file_path)
            return record.key
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch Voice Filter')
    parser.add_argument('-b', '--base_dir', type=str, default='.',
                        help="Root directory of run.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help='Path to last checkpoint')
    parser.add_argument('-e', '--embedder_path', type=str, required=True,
                        help="path of embedder model pt file")
    parser.add_argument('-m', '--model', type=str, required=True,
                        help="Name of the model. Used for both logging and saving checkpoints.")
    args = parser.parse_args()

    chkpt_path = args.checkpoint_path if args.checkpoint_path is not None else None

    pt_dir = os.path.join(args.base_dir, config.log['chkpt_dir'], args.model)
    os.makedirs(pt_dir, exist_ok=True)

    log_dir = os.path.join(args.base_dir, config.log['log_dir'], args.model)
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(
                os.path.join(log_dir, '%s-%d.log' % (args.model, time.time()))),
            logging.StreamHandler()
        ])
    logger = logging.getLogger()
    writer = MyWriter(log_dir)

    trainloader = create_dataloader(train=True)
    testloader = create_dataloader(train=False)

    embedder_pt = torch.load(args.embedder_path)
    embedder = SpeechEmbedder().cuda()
    embedder.load_state_dict(embedder_pt)
    embedder.eval()

    model = nn.DataParallel(VoiceFilter())
    optimizer = torch.optim.Adam(model.parameters(), lr=config.train['adam'])
    audio = Audio()

    starting_step = 0
    starting_epoch = 1
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint_file = torch.load(chkpt_path)
        model.load_state_dict(checkpoint_file['model'])
        starting_epoch = checkpoint_file['epoch']
        starting_step = checkpoint_file['step']
    else:
        logger.info("Starting new training run")

    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    for epoch in range(starting_epoch, config.train['epoch'] + 1):
        train(embedder, model, optimizer, trainloader, writer, logger, epoch,
              pt_dir, starting_step)
        validate(audio, model, embedder, testloader, writer, epoch)
        scheduler.step()
        starting_step = 0

    model_saver(model, pt_dir, config.train['epoch'],
                config.train['train_step_pre_epoch'])
def __init__(self, in_dir, out_dir, hparams):
    self.in_dir = in_dir
    self.out_dir = out_dir
    self.audio = Audio(hparams)
phonemes.extend(batch)

audio_data = np.concatenate([np.array(audio_data),
                             np.expand_dims(phonemes, axis=1)], axis=1)
if args.CACHE_PHON:
    np.save(phon_path, audio_data, allow_pickle=True)

print('\nBuilding dataset and writing files')
np.random.seed(42)
np.random.shuffle(audio_data)

test_metafile = os.path.join(args.TARGET_DIR, 'test_metafile.txt')
train_metafile = os.path.join(args.TARGET_DIR, 'train_metafile.txt')
test_lines = [''.join([filename, '|', text, '|', phon, '\n'])
              for filename, text, phon in audio_data[:config['n_test']]]
train_lines = [''.join([filename, '|', text, '|', phon, '\n'])
               for filename, text, phon in audio_data[config['n_test']:-1]]

with open(test_metafile, 'w+', encoding='utf-8') as test_f:
    test_f.writelines(test_lines)
with open(train_metafile, 'w+', encoding='utf-8') as train_f:
    train_f.writelines(train_lines)

audio = Audio(config)
for i in tqdm.tqdm(range(len(audio_data))):
    filename, _, _ = audio_data[i]
    wav_path = os.path.join(args.WAV_DIR, filename.replace('"', '') + '.wav')
    y, sr = librosa.load(wav_path, sr=config['sampling_rate'])
    mel = audio.mel_spectrogram(y)
    mel_path = os.path.join(mel_dir, filename)
    np.save(mel_path, mel.T)

print('\nDone')