def test_load(config_directory):
    """Check YAML training-config loading for a valid and an invalid file.

    ``basic_train_config.yaml`` must produce four training blocks whose
    first, second and last entries are monophone, triphone and SAT trainers.
    ``out_of_order_config.yaml`` must be rejected with ``ConfigError``.
    """
    basic_path = os.path.join(config_directory, 'basic_train_config.yaml')
    train, _ = train_yaml_to_config(basic_path)
    assert len(train.training_configs) == 4
    expected_types = (
        (0, MonophoneTrainer),
        (1, TriphoneTrainer),
        (-1, SatTrainer),
    )
    for position, trainer_type in expected_types:
        assert isinstance(train.training_configs[position], trainer_type)

    bad_path = os.path.join(config_directory, 'out_of_order_config.yaml')
    with pytest.raises(ConfigError):
        train_yaml_to_config(bad_path)
def test_load_mono_train(config_directory, mono_train_config_path):
    """Check that the mono training config disables ``use_mp`` everywhere.

    ``use_mp`` must be falsy on every trainer, on the align config, and on
    the feature config attached to each of them.
    """
    train_config, align_config = train_yaml_to_config(mono_train_config_path)
    # Trainers first, align config last, so failures surface in the same order.
    configs_to_check = list(train_config.training_configs) + [align_config]
    for config in configs_to_check:
        assert not config.use_mp
        assert not config.feature_config.use_mp
def test_load_ivector_train(config_directory, train_ivector_config):
    """Check the ivector training config: no multiprocessing, energy enabled.

    Every trainer must have ``use_mp`` falsy on itself and on its feature
    config, and ``use_energy`` truthy on its feature config.  The align
    config must have ``use_mp`` falsy on itself and on its feature config.
    """
    train_config, align_config = train_yaml_to_config(train_ivector_config)
    for trainer in train_config.training_configs:
        assert not trainer.use_mp
        trainer_features = trainer.feature_config
        assert not trainer_features.use_mp
        assert trainer_features.use_energy
    assert not align_config.use_mp
    assert not align_config.feature_config.use_mp
def ivector_train_config(config_directory):
    """Return the result of ``train_yaml_to_config`` for ``ivector_train.yaml``.

    The file is looked up directly inside ``config_directory``.
    """
    yaml_path = os.path.join(config_directory, 'ivector_train.yaml')
    return train_yaml_to_config(yaml_path)
def lda_sat_train_config(config_directory):
    """Return the result of ``train_yaml_to_config`` for ``lda_sat_train.yaml``.

    The file is looked up directly inside ``config_directory``.
    """
    yaml_path = os.path.join(config_directory, 'lda_sat_train.yaml')
    return train_yaml_to_config(yaml_path)
def mono_train_config(mono_train_config_path):
    """Return the result of ``train_yaml_to_config`` for the given YAML path."""
    loaded = train_yaml_to_config(mono_train_config_path)
    return loaded
def nnet_ivectors_train_config(config_directory):
    """Return the result of ``train_yaml_to_config`` for ``nnet_ivectors_train.yaml``.

    The file is looked up directly inside ``config_directory``.
    """
    yaml_path = os.path.join(config_directory, 'nnet_ivectors_train.yaml')
    return train_yaml_to_config(yaml_path)
def train_ivector(args):
    """Align a corpus with a pretrained model, then run the 'ivector' training block.

    The working directory is ``<temp dir>/<corpus name>``.  A ``config.yml``
    file in that directory records the parameters of the run; when an
    existing record is dirty or does not match the current arguments, a
    warning is logged but the old directory is still reused (pass ``clean``
    to remove it first).

    Attributes read from ``args``: ``temp_directory``, ``corpus_directory``
    (a trailing path separator is stripped in place), ``config_path``,
    ``dictionary_path``, ``acoustic_model_path``, ``speaker_characters``,
    ``num_jobs``, ``verbose``, ``output_model_path`` and, optionally,
    ``clean`` and ``debug``.

    Raises:
        Whatever the corpus/dictionary/aligner/trainer calls raise.  Before
        the exception propagates the run is recorded as dirty; this includes
        ``KeyboardInterrupt``, so that an interrupted run is reported on the
        next invocation.
    """
    command = 'train_ivector'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator gives an empty basename; drop it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    if args.config_path:
        train_config, align_config = train_yaml_to_config(args.config_path)
    else:
        train_config, align_config = load_basic_train_ivector()
    conf_path = os.path.join(data_directory, 'config.yml')
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    # The logger is created only after the optional clean-up above.
    logger = setup_logger(command, data_directory)
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': all_begin,
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path,
            'acoustic_model_path': args.acoustic_model_path,
        }
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path \
            or conf['acoustic_model_path'] != args.acoustic_model_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        # Spell out every mismatch at debug level.
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug(
                'Previous run was a different subcommand than {} (was {})'.format(
                    command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(
                             conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(
                conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))
        if conf['acoustic_model_path'] != args.acoustic_model_path:
            logger.debug('Previous run used acoustic model path {} '
                         '(new run: {})'.format(conf['acoustic_model_path'],
                                                args.acoustic_model_path))

    os.makedirs(data_directory, exist_ok=True)
    try:
        begin = time.time()
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs,
                                 debug=getattr(args, 'debug', False),
                                 logger=logger, use_mp=align_config.use_mp)
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set, logger=logger)
        acoustic_model.validate(dictionary)
        a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config,
                              temp_directory=data_directory, logger=logger)
        logger.debug(
            'Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose

        begin = time.time()
        a.align()
        logger.debug('Performed alignment in {} seconds'.format(time.time() - begin))
        for identifier, trainer in train_config.items():
            # Every block receives the logger, but only 'ivector' is trained.
            trainer.logger = logger
            if identifier != 'ivector':
                continue
            begin = time.time()
            trainer.init_training(identifier, data_directory, corpus, dictionary, a)
            trainer.train(call_back=print)
            logger.debug('Training took {} seconds'.format(time.time() - begin))
        # Saves the trainer left bound by the loop, i.e. the last configured block.
        trainer.save(args.output_model_path)

        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except BaseException:
        # BaseException rather than Exception: KeyboardInterrupt (Ctrl-C) must
        # also mark the run dirty, which is what the debug message above
        # reports on the next run.  The bare raise keeps the original traceback.
        conf['dirty'] = True
        raise
    finally:
        # Iterate over a copy because removeHandler mutates logger.handlers.
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def train_ivector(args):
    """Train with a ``TrainableAligner`` and save the resulting model.

    The working directory is ``<temp dir>/<corpus name>``.  It is wiped when
    ``clean`` is set, or when the ``config.yml`` stored there is dirty or
    does not match the current version, corpus directory, dictionary path
    or the ``'train_and_align'`` run type.  OOV report files produced while
    loading the corpus are copied to ``args.output_directory`` when present.

    Attributes read from ``args``: ``temp_directory``, ``corpus_directory``
    (a trailing path separator is stripped in place), ``dictionary_path``,
    ``speaker_characters``, ``config_path``, ``verbose``,
    ``output_directory``, ``output_model_path`` and, optionally, ``clean``,
    ``num_jobs`` (default 3) and ``debug``.

    Raises:
        Any ``Exception`` from the pipeline, after the run has been recorded
        as dirty in ``config.yml``.
    """
    temp_dir = os.path.expanduser(args.temp_directory) if args.temp_directory else TEMP_DIR
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator gives an empty basename; drop it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')

    if not os.path.exists(conf_path):
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'train_and_align',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    else:
        with open(conf_path, 'r') as conf_file:
            conf = yaml.load(conf_file, Loader=yaml.SafeLoader)

    # Start from scratch when asked to, or when the stored run record
    # cannot be trusted for the current arguments.
    must_reset = (
        getattr(args, 'clean', False)
        or conf['dirty']
        or conf['type'] != 'train_and_align'
        or conf['corpus_directory'] != args.corpus_directory
        or conf['version'] != __version__
        or conf['dictionary_path'] != args.dictionary_path
    )
    if must_reset:
        shutil.rmtree(data_directory, ignore_errors=True)

    os.makedirs(data_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(
            args.corpus_directory,
            data_directory,
            speaker_characters=args.speaker_characters,
            num_jobs=getattr(args, 'num_jobs', 3),
            debug=getattr(args, 'debug', False),
        )
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        # Copy whichever OOV reports exist next to the requested output.
        for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
            report_path = os.path.join(corpus.split_directory(), report_name)
            if os.path.exists(report_path):
                shutil.copy(report_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train_ivector()
        aligner = TrainableAligner(corpus, dictionary, train_config, align_config,
                                   temp_directory=data_directory)
        aligner.verbose = args.verbose
        aligner.train()
        aligner.save(args.output_model_path)
    except Exception as e:
        conf['dirty'] = True
        raise e
    finally:
        # The run record is written whether or not the run succeeded.
        with open(conf_path, 'w') as conf_file:
            yaml.dump(conf, conf_file)
def align_corpus(args, unknown_args=None):
    """Train an acoustic model on a corpus and export the resulting alignments.

    The working directory is ``<temp dir>/<corpus name>``.  A ``config.yml``
    file in that directory records the parameters of the run; when an
    existing record is dirty or does not match the current arguments, a
    warning is logged but the old directory is still reused (pass ``clean``
    to remove it first).  TextGrids are exported to ``args.output_directory``
    and the model is saved only when ``args.output_model_path`` is not None.

    Attributes read from ``args``: ``temp_directory``, ``corpus_directory``
    (a trailing path separator is stripped in place), ``config_path``,
    ``debug``, ``dictionary_path``, ``output_directory``,
    ``speaker_characters``, ``verbose``, ``output_model_path`` and,
    optionally, ``clean`` and ``num_jobs`` (default 3).

    Args:
        args: Parsed command-line namespace (see above).
        unknown_args: Extra arguments; when truthy they are handed to
            ``align_config.update_from_args``.

    Raises:
        Whatever the corpus/dictionary/aligner calls raise.  Before the
        exception propagates the run is recorded as dirty; this includes
        ``KeyboardInterrupt``, so that an interrupted run is reported on the
        next invocation.
    """
    command = 'train_and_align'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator gives an empty basename; drop it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    # NOTE(review): the logger is created before the optional --clean removal
    # of data_directory below.  If setup_logger opens a log file inside that
    # directory, the file is deleted from under the handler (or blocks the
    # removal on Windows) -- confirm against setup_logger.
    logger = setup_logger(command, data_directory)
    if args.config_path:
        train_config, align_config = train_yaml_to_config(args.config_path)
    else:
        train_config, align_config = load_basic_train()
    if unknown_args:
        align_config.update_from_args(unknown_args)
    conf_path = os.path.join(data_directory, 'config.yml')
    if args.debug:
        logger.warning(
            'Running in DEBUG mode, may have impact on performance and disk usage.'
        )
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        logger.info('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        # Spell out every mismatch at debug level.
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug(
                'Previous run was a different subcommand than {} (was {})'.format(
                    command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(
                             conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(
                conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=getattr(args, 'num_jobs', 3),
                                 debug=getattr(args, 'debug', False),
                                 logger=logger, use_mp=align_config.use_mp)
        if corpus.issues_check:
            logger.warning('Some issues parsing the corpus were detected. '
                           'Please run the validator to get more information.')
        logger.info(corpus.speaker_utterance_info())
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set, logger=logger)
        # Copy whichever OOV reports exist next to the exported alignments.
        utt_oov_path = os.path.join(corpus.split_directory(), 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        a = TrainableAligner(corpus, dictionary, train_config, align_config,
                             temp_directory=data_directory, logger=logger,
                             debug=getattr(args, 'debug', False))
        a.verbose = args.verbose
        begin = time.time()
        a.train()
        logger.debug('Training took {} seconds'.format(time.time() - begin))
        a.export_textgrids(args.output_directory)
        if args.output_model_path is not None:
            a.save(args.output_model_path)
        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except BaseException:
        # BaseException rather than Exception: KeyboardInterrupt (Ctrl-C) must
        # also mark the run dirty, which is what the debug message above
        # reports on the next run.
        conf['dirty'] = True
        raise
    finally:
        # Iterate over a copy because removeHandler mutates logger.handlers.
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)