from utils.audio import Audio

# Mel-preprocessing script fragment: parse CLI flags, load the session config,
# scan the corpus, then (unless --skip_mels) define the per-wav mel extractor.
# NOTE(review): `argparse`, `np` (numpy), `Path` and the project's Config /
# DataReader are assumed to be imported earlier in the file — not visible here.

np.random.seed(42)

parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, required=True)
parser.add_argument('--skip_phonemes', action='store_true')
parser.add_argument('--skip_mels', action='store_true')
parser.add_argument('--phonemizer_parallel_jobs', type=int, default=16)
parser.add_argument('--phonemizer_batch_size', type=int, default=16)
args = parser.parse_args()
# Echo every parsed argument so the run is reproducible from its log.
for arg in vars(args):
    print('{}: {}'.format(arg, getattr(args, arg)))

cm = Config(args.config, model_kind='autoregressive')
cm.create_remove_dirs()
metadatareader = DataReader.from_config(cm, kind='original', scan_wavs=True)

if not args.skip_mels:
    import sys

    def process_wav(wav_path: Path):
        """Compute and save the mel spectrogram for one wav file.

        Loads the audio, extracts the mel spectrogram, saves it as
        ``<mel_dir>/<stem>.npy`` and returns ``(file_name, n_frames)``
        for bookkeeping by the caller.
        """
        file_name = wav_path.stem
        # `audio` is an Audio instance created at module level elsewhere
        # in this file — TODO(review) confirm it exists before this runs.
        y, sr = audio.load_wav(str(wav_path))
        mel = audio.mel_spectrogram(y)
        # BUG FIX: the original wrote
        #   assert mel.shape[1] == audio.config['mel_channels'], len(mel.shape) == 2
        # which passed the rank check as the (never-displayed) assert
        # *message*, so `len(mel.shape) == 2` was never actually verified.
        assert len(mel.shape) == 2, f'Expected 2-D mel, got shape {mel.shape}'
        assert mel.shape[1] == audio.config['mel_channels'], \
            f'Mel channel dim {mel.shape[1]} != configured mel_channels'
        mel_path = (cm.mel_dir / file_name).with_suffix('.npy')
        np.save(mel_path, mel)
        return (file_name, mel.shape[0])
from utils.config_manager import Config
from data.audio import Audio

# ASR data-preprocessing script fragment: parse CLI flags, load the session
# config, scan corpus metadata and set up summary logging before extraction.
# NOTE(review): `argparse`, `np` (numpy), DataReader and SummaryManager are
# assumed to be imported earlier in the file — not visible in this fragment.

np.random.seed(42)

parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, default='config/session_paths.yaml')
parser.add_argument('--skip_phonemes', action='store_true')
parser.add_argument('--skip_mels', action='store_true')
parser.add_argument('--skip_speakers', action='store_true')
args = parser.parse_args()
# Echo every parsed argument so the run is reproducible from its log.
for arg in vars(args):
    print('{}: {}'.format(arg, getattr(args, arg)))

cm = Config(args.config, asr=True)
cm.create_remove_dirs()
metadatareader = DataReader.from_config(cm, kind='original')
summary_manager = SummaryManager(model=None,
                                 log_dir=cm.log_dir / 'data_preprocessing',
                                 config=cm.config,
                                 default_writer='data_preprocessing')
print(f'\nFound {len(metadatareader.filenames)} audio files.')
audio = Audio(config=cm.config)

if not args.skip_mels:
    def process_file(tuples):
        # Accumulators for per-file lengths, speaker→files mapping and
        # files to discard.
        # NOTE(review): this function is truncated at the end of the
        # visible fragment — the rest of its body is not shown here.
        len_dict = {}
        spk_file_dict = {}
        remove_files = []
from data.datasets import ASRDataset
from utils.logging_utils import SummaryManager
from utils.scripts_utils import dynamic_memory_allocation, basic_train_parser
from ctc_segmentation import ctc_segmentation, determine_utterance_segments
from ctc_segmentation import CtcSegmentationParameters
from ctc_segmentation import prepare_token_list
import tgt

# ASR training/segmentation script fragment: seed RNGs, enable dynamic GPU
# memory growth, build the model and the validation dataset.
# NOTE(review): `np` (numpy), `tf` (tensorflow) and Config are assumed to be
# imported earlier in the file — not visible in this fragment.

np.random.seed(42)
tf.random.set_seed(42)
dynamic_memory_allocation()

parser = basic_train_parser()
args = parser.parse_args()

config = Config(config_path=args.config, asr=True)
config_dict = config.config
config.create_remove_dirs(clear_dir=args.clear_dir,
                          clear_logs=args.clear_logs,
                          clear_weights=args.clear_weights)
config.dump_config()
config.print_config()

model = config.get_model()
config.compile_model(model)
data_handler = ASRDataset.from_config(config,
                                      tokenizer=model.text_pipeline.tokenizer,
                                      kind='valid')
# NOTE(review): this call is truncated at the end of the visible fragment.
dataset = data_handler.get_dataset(
    bucket_batch_sizes=config_dict['bucket_batch_sizes'],
# NOTE(review): this fragment starts INSIDE a validation function whose `def`
# line is outside the visible chunk; the indentation below is reconstructed.
    # Log validation loss, attention maps, predicted and target mels to
    # TensorBoard, then surface the scalar loss to the caller.
    summary_manager.display_loss(model_out, tag='Validation', plot_all=True)
    summary_manager.display_attention_heads(model_out,
                                            tag='ValidationAttentionHeads')
    # summary_manager.display_mel(mel=model_out['mel_linear'][0], tag=f'Validation/linear_mel_out')
    summary_manager.display_mel(
        mel=model_out['final_output'][0],
        tag=f'Validation/predicted_mel_{fname[0].numpy().decode("utf-8")}')
    # residual = abs(model_out['mel_linear'] - model_out['final_output'])
    # summary_manager.display_mel(mel=residual[0], tag=f'Validation/conv-linear_residual')
    summary_manager.display_mel(
        mel=val_mel[0],
        tag=f'Validation/target_mel_{fname[0].numpy().decode("utf-8")}')
    return val_loss['loss']


# Top-level training setup: load the autoregressive config, prepare the run
# directories, build/compile the model and its data pipeline.
config_manager = Config(config_path=args.config, model_kind='autoregressive')
config = config_manager.config
config_manager.create_remove_dirs(clear_dir=args.clear_dir,
                                  clear_logs=args.clear_logs,
                                  clear_weights=args.clear_weights)
config_manager.dump_config()
config_manager.print_config()
# # get model, prepare data for model, create datasets
model = config_manager.get_model()
config_manager.compile_model(model)
data_prep = AutoregressivePreprocessor.from_config(
    config_manager, tokenizer=model.text_pipeline.tokenizer)
# NOTE(review): this call is truncated at the end of the visible fragment.
train_data_handler = TextMelDataset.from_config(config_manager,
                                                preprocessor=data_prep,
# NOTE(review): this fragment starts INSIDE a test/validation function whose
# `def` line is outside the visible chunk; indentation below is reconstructed.
    # Log the target mel plus vocoded audio for both target and prediction,
    # then surface the scalar loss to the caller.
    summary_manager.display_mel(
        mel=tar_value, tag=f'Test/{fname[j].numpy().decode("utf-8")}/target')
    summary_manager.display_audio(
        tag=f'Prediction {fname[j].numpy().decode("utf-8")}/target',
        mel=tar_value)
    summary_manager.display_audio(
        tag=f'Prediction {fname[j].numpy().decode("utf-8")}/prediction',
        mel=predval)
    return val_loss['loss']


# Top-level TTS training setup: parse CLI flags, load the config, prepare
# run directories, build/compile the model and the training dataset.
parser = basic_train_parser()
args = parser.parse_args()

config = Config(config_path=args.config)
config_dict = config.config
config.create_remove_dirs(clear_dir=args.clear_dir,
                          clear_logs=args.clear_logs,
                          clear_weights=args.clear_weights)
config.dump_config()
config.print_config()

model = config.get_model()
config.compile_model(model)
data_prep = TTSPreprocessor.from_config(
    config=config, tokenizer=model.text_pipeline.tokenizer)
train_data_handler = TTSDataset.from_config(config,
                                            preprocessor=data_prep,
                                            kind='train')
# Prediction script fragment: resolve the input text (from a file or a
# literal CLI string), create the output directory, and collect the set of
# checkpoint weights to run inference with.
# NOTE(review): `args`, `Path`, Config and Audio come from earlier in the
# file — not visible in this fragment.
if args.file is not None:
    # Read all lines of the input file; each line becomes one utterance.
    with open(args.file, 'r') as file:
        text = file.readlines()
    fname = Path(args.file).stem
elif args.text is not None:
    text = [args.text]
    fname = 'custom_text'
else:
    # Neither -f nor -t was given: print usage hint and bail out.
    fname = None
    text = None
    print(
        f'Specify either an input text (-t "some text") or a text input file (-f /path/to/file.txt)'
    )
    exit()

config_loader = Config(config_path=args.config)
# Output root: explicit --outdir if given, else the config's log dir.
outdir = Path(
    args.outdir) if args.outdir is not None else config_loader.log_dir
outdir = outdir / 'outputs' / f'{fname}'
outdir.mkdir(exist_ok=True, parents=True)
print('===' * 10, outdir)
audio = Audio(config_loader.config)

# Decide which checkpoint(s) to evaluate: one explicit checkpoint, every
# saved checkpoint in the weights dir, or None (the default/latest).
if args.checkpoint is not None:
    all_weights = [args.checkpoint]
elif args.all_weights:
    all_weights = [(config_loader.weights_dir / x.stem).as_posix()
                   for x in config_loader.weights_dir.iterdir()
                   if x.suffix == '.index']
else:
    all_weights = [None]  # default
assert (args.fill_mode_max is False) or (args.fill_mode_next is False), 'Choose one gap filling mode.' weighted = not args.best binary = args.binary fill_gaps = args.fill_mode_max or args.fill_mode_next fix_jumps = args.fix_jumps fill_mode = f"{f'max' * args.fill_mode_max}{f'next' * args.fill_mode_next}" filling_tag = f"{f'(max)' * args.fill_mode_max}{f'(next)' * args.fill_mode_next}" tag_description = ''.join([ f'{"_weighted" * weighted}{"_best" * (not weighted)}', f'{"_binary" * binary}', f'{"_filled" * fill_gaps}{filling_tag}', f'{"_fix_jumps" * fix_jumps}', f'_layer{args.extract_layer}' ]) writer_tag = f'DurationExtraction{tag_description}' print(writer_tag) config_manager = Config(config_path=args.config, model_kind='autoregressive') config = config_manager.config config_manager.print_config() if args.autoregressive_weights != '': model = config_manager.load_model(args.autoregressive_weights) else: model = config_manager.load_model() if model.r != 1: print( f"ERROR: model's reduction factor is greater than 1, check config. (r={model.r}" ) data_prep = AutoregressivePreprocessor.from_config( config=config_manager, tokenizer=model.text_pipeline.tokenizer) data_handler = TextMelDataset.from_config(config_manager, preprocessor=data_prep,