def test_basic(basic_dict_path, basic_corpus_dir, generated_dir, default_feature_config):
    """A basic corpus initializes against the basic dictionary and yields 39-dim features."""
    out_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, out_dir)
    lexicon.write()
    corpus = AlignableCorpus(basic_corpus_dir, out_dir)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    # 13 MFCCs + deltas + delta-deltas = 39
    assert corpus.get_feat_dim(default_feature_config) == 39
def test_basic(basic_dict_path, generated_dir):
    """The basic dictionary exposes the expected plain and positional phone sets."""
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()
    assert set(lexicon.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
    # Each non-silence phone gets Begin/Internal/End/Singleton variants.
    expected_positional = {
        '{}_{}'.format(phone, pos)
        for phone in ('phonea', 'phoneb', 'phonec')
        for pos in ('B', 'I', 'E', 'S')
    }
    assert set(lexicon.positional_nonsil_phones) == expected_positional
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir, default_feature_config):
    """Stereo audio files are handled and still produce 39-dimensional features."""
    work_dir = os.path.join(temp_dir, 'stereo')
    lexicon = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    lexicon.write()
    corpus = AlignableCorpus(stereo_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    assert corpus.get_feat_dim(default_feature_config) == 39
def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir, default_feature_config):
    """A .txt-transcript corpus has no untranscribed files and initializes cleanly."""
    out_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, out_dir)
    lexicon.write()
    corpus = AlignableCorpus(basic_corpus_txt_dir, out_dir)
    # Every sound file should have a paired transcript.
    assert not corpus.no_transcription_files
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    assert corpus.get_feat_dim(default_feature_config) == 39
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary, default_feature_config):
    """Splitting and subsetting a large corpus creates both output directories."""
    out_dir = os.path.join(temp_dir, 'large_subset')
    shutil.rmtree(out_dir, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, out_dir)
    lexicon.write()
    corpus = AlignableCorpus(large_prosodylab_format_directory, out_dir)
    corpus.initialize_corpus(lexicon)
    split_dir = corpus.split_directory()
    default_feature_config.generate_features(corpus)
    subset_dir = corpus.subset_directory(10, default_feature_config)
    assert os.path.exists(split_dir)
    assert os.path.exists(subset_dir)
def test_transcribe_from_temp(basic_corpus_txt_dir, basic_dict_path, generated_dir, default_feature_config):
    """A TranscribeCorpus can be built twice; the second pass reloads saved temp state."""
    out_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, out_dir)
    lexicon.write()
    # Run the identical pipeline twice: first from scratch, then from the
    # temporary files written by the first pass.
    for _ in range(2):
        corpus = TranscribeCorpus(basic_corpus_txt_dir, out_dir, use_mp=False)
        corpus.initialize_corpus(lexicon)
        default_feature_config.generate_features(corpus)
        assert corpus.get_feat_dim(default_feature_config) == 39
def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir, default_feature_config):
    """Segments too short for feature extraction are ignored but still tracked."""
    work_dir = os.path.join(temp_dir, 'short_segments')
    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()
    corpus = AlignableCorpus(shortsegments_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    # One of the three segments is too short to yield features.
    assert len(corpus.feat_mapping) == 2
    assert len(corpus.utt_speak_mapping) == 3
    assert len(corpus.speak_utt_mapping) == 1
    assert len(corpus.text_mapping) == 3
    assert len(corpus.utt_wav_mapping) == 1
    assert len(corpus.segments) == 3
    assert len(corpus.ignored_utterances) == 1
def test_weird_words(weird_words_dir, temp_dir, sick_dict_path):
    """Clitic/apostrophe handling in the dictionary and OOV detection in the corpus.

    Fix: removed a leftover debug ``print`` of ``utterance_oovs`` that cluttered
    test output; the assertion immediately after it covers the same data.
    """
    output_directory = os.path.join(temp_dir, 'weird_words')
    shutil.rmtree(output_directory, ignore_errors=True)
    d = Dictionary(sick_dict_path, output_directory)
    # Curly-apostrophe variants must have been normalized out of the word set.
    assert 'i’m' not in d.words
    assert '’m' not in d.words
    assert d.words["i'm"][0]['pronunciation'] == ('ay', 'm', 'ih')
    assert d.words["i'm"][1]['pronunciation'] == ('ay', 'm')
    assert d.words["'m"][0]['pronunciation'] == ('m',)
    d.write()
    c = AlignableCorpus(weird_words_dir, output_directory, use_mp=False)
    c.initialize_corpus(d)
    # Only genuinely unknown tokens should be reported as OOV.
    assert c.utterance_oovs['weird_words'] == ['ajfish', 'asds-asda', 'sdasd']
def validate_corpus(args):
    """Run corpus/dictionary validation from parsed command-line arguments."""
    temp_dir = os.path.expanduser(args.temp_directory) if args.temp_directory else TEMP_DIR
    corpus_name = os.path.basename(args.corpus_directory)
    if not corpus_name:
        # A trailing path separator makes basename empty; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    # Validation always starts from a clean temporary directory.
    shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    corpus = AlignableCorpus(args.corpus_directory, data_directory,
                             speaker_characters=args.speaker_characters,
                             num_jobs=getattr(args, 'num_jobs', 3))
    dictionary = Dictionary(args.dictionary_path, data_directory,
                            word_set=corpus.word_set)
    validator = CorpusValidator(corpus, dictionary,
                                temp_directory=data_directory,
                                ignore_acoustics=getattr(args, 'ignore_acoustics', False),
                                test_transcriptions=getattr(args, 'test_transcriptions', False),
                                use_mp=not args.disable_mp)
    validator.validate()
def train_lm(args):
    """Train a language model from a corpus directory given parsed CLI arguments."""
    temp_dir = os.path.expanduser(args.temp_directory) if args.temp_directory else TEMP_DIR
    corpus_name = os.path.basename(args.corpus_directory)
    if not corpus_name:
        # Strip a trailing separator so basename yields the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    corpus = AlignableCorpus(args.corpus_directory, data_directory)
    if args.config_path:
        train_config = train_lm_yaml_to_config(args.config_path)
    else:
        train_config = load_basic_train_lm()
    if args.dictionary_path is not None:
        dictionary = Dictionary(args.dictionary_path, data_directory)
    else:
        # Without a dictionary the trainer works from raw transcripts.
        dictionary = None
    trainer = LmTrainer(corpus, train_config, args.output_model_path,
                        dictionary=dictionary,
                        temp_directory=data_directory,
                        num_jobs=args.num_jobs)
    trainer.train()
def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output, temp_dir):
    """The g2p CLI in orthography mode (-d) writes a loadable, non-empty dictionary."""
    if G2P_DISABLED:
        pytest.skip('No Pynini found')
    argv = [
        'g2p',
        basic_corpus_dir,
        orth_sick_output,
        '-t', temp_dir,
        '-q',
        '--clean',
        '-d',
    ]
    args, unknown = parser.parse_known_args(argv)
    run_g2p(args)
    assert os.path.exists(orth_sick_output)
    produced = Dictionary(orth_sick_output, temp_dir)
    assert len(produced.words) > 0
def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output, temp_dir):
    """run_g2p with no model falls back to orthographic pronunciations."""
    if G2P_DISABLED:
        pytest.skip('No Pynini found')
    args = G2PDummyArgs()
    # No model path triggers the orthography-based fallback.
    for attr, value in (('g2p_model_path', None),
                        ('input_path', basic_corpus_dir),
                        ('output_path', orth_sick_output),
                        ('temp_directory', temp_dir)):
        setattr(args, attr, value)
    run_g2p(args)
    assert os.path.exists(orth_sick_output)
    produced = Dictionary(orth_sick_output, temp_dir)
    assert len(produced.words) > 0
def validate_corpus(args):
    """CLI entry point: validate a corpus/dictionary pair, with logging and timing.

    Optionally checks a supplied acoustic model against the dictionary first.
    The temp directory is always wiped before validation.
    """
    command = 'validate'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing path separator makes basename empty; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    # Validation always starts from a clean temporary directory.
    shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    logger = setup_logger(command, data_directory)
    corpus = AlignableCorpus(args.corpus_directory, data_directory,
                             speaker_characters=args.speaker_characters,
                             num_jobs=getattr(args, 'num_jobs', 3),
                             logger=logger, use_mp=not args.disable_mp)
    dictionary = Dictionary(args.dictionary_path, data_directory, logger=logger)
    if args.acoustic_model_path:
        # If a model is given, verify its phone set is compatible with the dictionary.
        acoustic_model = AcousticModel(args.acoustic_model_path)
        acoustic_model.validate(dictionary)
    a = CorpusValidator(corpus, dictionary, temp_directory=data_directory,
                        ignore_acoustics=getattr(args, 'ignore_acoustics', False),
                        test_transcriptions=getattr(args, 'test_transcriptions', False),
                        use_mp=not args.disable_mp,
                        logger=logger)
    begin = time.time()
    a.validate()
    logger.debug('Validation took {} seconds'.format(time.time() - begin))
    logger.info('All done!')
    logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin))
    # Close and detach handlers explicitly so the log file is released.
    handlers = logger.handlers[:]
    for handler in handlers:
        handler.close()
        logger.removeHandler(handler)
def train_g2p(args):
    """Train a G2P model from a pronunciation dictionary via CLI arguments."""
    temp_dir = os.path.expanduser(args.temp_directory) if args.temp_directory else TEMP_DIR
    dictionary = Dictionary(args.dictionary_path, '')
    trainer = Trainer(dictionary, args.output_model_path,
                      temp_directory=temp_dir,
                      order=args.order,
                      num_jobs=args.num_jobs,
                      use_mp=not args.disable_mp)
    if args.validate:
        trainer.validate()
    trainer.train()
def train_lm(args):
    """CLI entry point: train a language model from a corpus directory or an ARPA file.

    An ``.arpa`` source is passed straight through to the trainer; anything else
    is loaded as an AlignableCorpus first.
    """
    command = 'train_lm'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    if args.config_path:
        train_config = train_lm_yaml_to_config(args.config_path)
    else:
        train_config = load_basic_train_lm()
    corpus_name = os.path.basename(args.source_path)
    if corpus_name == '':
        # Trailing separator: strip it so basename is the directory name.
        args.source_path = os.path.dirname(args.source_path)
        corpus_name = os.path.basename(args.source_path)
    source = args.source_path
    dictionary = None
    if args.source_path.lower().endswith('.arpa'):
        # Drop the .arpa extension so the temp dir is named after the model.
        corpus_name = os.path.splitext(corpus_name)[0]
        data_directory = os.path.join(temp_dir, corpus_name)
    else:
        data_directory = os.path.join(temp_dir, corpus_name)
    logger = setup_logger(command, data_directory)
    if not args.source_path.lower().endswith('.arpa'):
        # Non-ARPA sources are treated as corpora; MP only pays off with >1 job.
        source = AlignableCorpus(args.source_path, data_directory,
                                 num_jobs=args.num_jobs,
                                 use_mp=args.num_jobs > 1)
        if args.dictionary_path is not None:
            dictionary = Dictionary(args.dictionary_path, data_directory)
        else:
            dictionary = None
    trainer = LmTrainer(source, train_config, args.output_model_path,
                        dictionary=dictionary,
                        temp_directory=data_directory,
                        supplemental_model_path=args.model_path,
                        supplemental_model_weight=args.model_weight)
    begin = time.time()
    trainer.train()
    logger.debug('Training took {} seconds'.format(time.time() - begin))
    logger.info('All done!')
    logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin))
    # Close and detach handlers explicitly so the log file is released.
    handlers = logger.handlers[:]
    for handler in handlers:
        handler.close()
        logger.removeHandler(handler)
def train_lm(args):
    """CLI entry point: train a language model from a corpus directory or ARPA file.

    Variant without a logger; reports total wall time via ``print``.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    all_begin = time.time()
    corpus_name = os.path.basename(args.source_path)
    if corpus_name == '':
        # Trailing separator: strip it so basename is the directory name.
        args.source_path = os.path.dirname(args.source_path)
        corpus_name = os.path.basename(args.source_path)
    if args.source_path.lower().endswith('.arpa'):
        # ARPA models are passed through untouched; no dictionary needed.
        source = args.source_path
        dictionary = None
        corpus_name = os.path.splitext(corpus_name)[0]
        data_directory = os.path.join(temp_dir, corpus_name)
    else:
        data_directory = os.path.join(temp_dir, corpus_name)
        source = AlignableCorpus(args.source_path, data_directory, num_jobs=args.num_jobs)
        if args.dictionary_path is not None:
            dictionary = Dictionary(args.dictionary_path, data_directory)
        else:
            dictionary = None
    if args.config_path:
        train_config = train_lm_yaml_to_config(args.config_path)
    else:
        train_config = load_basic_train_lm()
    trainer = LmTrainer(source, train_config, args.output_model_path,
                        dictionary=dictionary,
                        temp_directory=data_directory,
                        supplemental_model_path=args.model_path,
                        supplemental_model_weight=args.model_weight)
    trainer.train()
    print('Done! Everything took {} seconds'.format(time.time() - all_begin))
def test_speaker_groupings(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary, default_feature_config):
    """Speakers and files are distributed across job groups for both 1 and 2 jobs."""
    out_dir = os.path.join(temp_dir, 'large')
    shutil.rmtree(out_dir, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, out_dir)

    def verify(corpus):
        # Every speaker directory appears in some speaker group.
        for speaker in os.listdir(large_prosodylab_format_directory):
            assert any(speaker in group for group in corpus.speaker_groups)
        # Every file stem appears in some utterance group.
        for root, dirs, files in os.walk(large_prosodylab_format_directory):
            for f in files:
                stem = os.path.splitext(f)[0]
                assert any(stem in group for group in corpus.groups)
        # Every file stem has generated features.
        for root, dirs, files in os.walk(large_prosodylab_format_directory):
            for f in files:
                stem = os.path.splitext(f)[0]
                assert any(stem in group for group in corpus.feat_mapping)

    lexicon.write()
    corpus = AlignableCorpus(large_prosodylab_format_directory, out_dir)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    verify(corpus)

    # Repeat from a clean directory with two parallel jobs.
    shutil.rmtree(out_dir, ignore_errors=True)
    lexicon.write()
    corpus = AlignableCorpus(large_prosodylab_format_directory, out_dir, num_jobs=2)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    verify(corpus)
def test_extra_annotations(extra_annotations_path, generated_dir):
    """Annotation characters such as '{' are retained in the grapheme set."""
    lexicon = Dictionary(extra_annotations_path, os.path.join(generated_dir, 'extra'))
    assert '{' in lexicon.graphemes
    lexicon.write()
def test_basic_noposition(basic_dict_path, generated_dir):
    """With position_dependent_phones=False only the bare phone set is produced.

    Fix: removed the dead ``x = d.write()`` assignment — the return value was
    never used.
    """
    d = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'),
                   position_dependent_phones=False)
    d.write()
    assert set(d.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
def align_corpus(args):
    """CLI entry point: align a corpus with a pretrained acoustic model.

    Reuses a per-corpus temp directory guarded by a ``config.yml`` stamp; any
    mismatch (or ``--clean``) wipes the directory. On any exception the stamp is
    marked dirty so the next run starts fresh.
    """
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: strip it so basename is the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        # Fresh stamp describing this run's inputs.
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': 'align',
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path}
    # Wipe the temp directory if the previous run's inputs differ in any way.
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs)
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        # Ensure the model's phone set matches the dictionary before aligning.
        acoustic_model.validate(dictionary)
        begin = time.time()
        if args.config_path:
            align_config = align_yaml_to_config(args.config_path)
        else:
            align_config = load_basic_align()
        a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config,
                              temp_directory=data_directory,
                              debug=getattr(args, 'debug', False))
        if args.debug:
            print('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose
        begin = time.time()
        a.align()
        if args.debug:
            print('Performed alignment in {} seconds'.format(time.time() - begin))
        begin = time.time()
        a.export_textgrids(args.output_directory)
        if args.debug:
            print('Exported TextGrids in {} seconds'.format(time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except Exception as _:
        # Mark the stamp dirty so the next run cleans up before reusing state.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def transcribe_corpus(args):
    """CLI entry point: transcribe a corpus with pretrained acoustic + language models.

    With ``--evaluate`` the corpus is loaded as alignable (reference transcripts
    available) and the transcriber runs in evaluation mode. A ``config.yml``
    stamp in the temp directory detects input changes from previous runs; here a
    mismatch only warns rather than wiping (use ``--clean`` to wipe).
    """
    command = 'transcribe'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: strip it so basename is the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    if args.config_path:
        transcribe_config = transcribe_yaml_to_config(args.config_path)
    else:
        transcribe_config = load_basic_transcribe()
    data_directory = os.path.join(temp_dir, corpus_name)
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    logger = setup_logger(command, data_directory)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    # NOTE(review): duplicate makedirs call; harmless with exist_ok=True.
    os.makedirs(data_directory, exist_ok=True)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        # Fresh stamp describing this run's inputs.
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'transcribe',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path,
            'acoustic_model_path': args.acoustic_model_path,
            'language_model_path': args.language_model_path,
        }
    # Warn (but do not wipe) if the previous run's inputs differ.
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path \
            or conf['language_model_path'] != args.language_model_path \
            or conf['acoustic_model_path'] != args.acoustic_model_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug(
                'Previous run was a different subcommand than {} (was {})'.
                format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(
                             conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(
                conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))
        if conf['acoustic_model_path'] != args.acoustic_model_path:
            logger.debug('Previous run used acoustic model path {} '
                         '(new run: {})'.format(conf['acoustic_model_path'],
                                                args.acoustic_model_path))
        if conf['language_model_path'] != args.language_model_path:
            logger.debug('Previous run used language model path {} '
                         '(new run: {})'.format(conf['language_model_path'],
                                                args.language_model_path))
    try:
        if args.evaluate:
            # Evaluation needs reference transcripts, hence an alignable corpus.
            corpus = AlignableCorpus(
                args.corpus_directory, data_directory,
                speaker_characters=args.speaker_characters,
                num_jobs=args.num_jobs, use_mp=transcribe_config.use_mp)
        else:
            corpus = TranscribeCorpus(
                args.corpus_directory, data_directory,
                speaker_characters=args.speaker_characters,
                num_jobs=args.num_jobs, use_mp=transcribe_config.use_mp)
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path,
                                       root_directory=data_directory)
        language_model = LanguageModel(args.language_model_path,
                                       root_directory=data_directory)
        dictionary = Dictionary(args.dictionary_path, data_directory)
        # Ensure the model's phone set matches the dictionary before decoding.
        acoustic_model.validate(dictionary)
        begin = time.time()
        t = Transcriber(corpus, dictionary, acoustic_model, language_model,
                        transcribe_config, temp_directory=data_directory,
                        debug=getattr(args, 'debug', False),
                        evaluation_mode=args.evaluate)
        if args.debug:
            print('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        begin = time.time()
        t.transcribe()
        if args.debug:
            print('Performed transcribing in {} seconds'.format(time.time() - begin))
        if args.evaluate:
            t.evaluate(args.output_directory)
            # Persist the best-scoring decode configuration alongside the output.
            best_config_path = os.path.join(args.output_directory,
                                            'best_transcribe_config.yaml')
            save_config(t.transcribe_config, best_config_path)
            t.export_transcriptions(args.output_directory)
        else:
            begin = time.time()
            t.export_transcriptions(args.output_directory)
            if args.debug:
                print('Exported transcriptions in {} seconds'.format(
                    time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except Exception as _:
        # Mark the stamp dirty so the next run cleans up before reusing state.
        conf['dirty'] = True
        raise
    finally:
        # Close and detach handlers explicitly so the log file is released.
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        if os.path.exists(data_directory):
            with open(conf_path, 'w') as f:
                yaml.dump(conf, f)
def train_ivector(args):
    """CLI entry point: train an ivector extractor on top of a pretrained alignment.

    First aligns the corpus with the given acoustic model, then runs only the
    ``ivector`` entry of the training config. The temp directory is stamped with
    a ``config.yml``; mismatched inputs only warn (use ``--clean`` to wipe).
    """
    command = 'train_ivector'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: strip it so basename is the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    if args.config_path:
        train_config, align_config = train_yaml_to_config(args.config_path)
    else:
        train_config, align_config = load_basic_train_ivector()
    conf_path = os.path.join(data_directory, 'config.yml')
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    logger = setup_logger(command, data_directory)
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        # Fresh stamp describing this run's inputs.
        conf = {
            'dirty': False,
            'begin': all_begin,
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path,
            'acoustic_model_path': args.acoustic_model_path,
        }
    # Warn (but do not wipe) if the previous run's inputs differ.
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path \
            or conf['acoustic_model_path'] != args.acoustic_model_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug(
                'Previous run was a different subcommand than {} (was {})'.
                format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(
                             conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(
                conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))
        if conf['acoustic_model_path'] != args.acoustic_model_path:
            logger.debug('Previous run used acoustic model path {} '
                         '(new run: {})'.format(conf['acoustic_model_path'],
                                                args.acoustic_model_path))
    os.makedirs(data_directory, exist_ok=True)
    try:
        begin = time.time()
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs,
                                 debug=getattr(args, 'debug', False),
                                 logger=logger, use_mp=align_config.use_mp)
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set, logger=logger)
        # Ensure the model's phone set matches the dictionary before aligning.
        acoustic_model.validate(dictionary)
        a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config,
                              temp_directory=data_directory, logger=logger)
        logger.debug('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose
        begin = time.time()
        a.align()
        logger.debug('Performed alignment in {} seconds'.format(time.time() - begin))
        # Only the 'ivector' trainer from the config is run here.
        for identifier, trainer in train_config.items():
            trainer.logger = logger
            if identifier != 'ivector':
                continue
            begin = time.time()
            trainer.init_training(identifier, data_directory, corpus, dictionary, a)
            trainer.train(call_back=print)
            logger.debug('Training took {} seconds'.format(time.time() - begin))
        # NOTE(review): relies on the loop variable after the loop — assumes
        # the config always contains an 'ivector' entry; verify upstream.
        trainer.save(args.output_model_path)
        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except Exception as e:
        # Mark the stamp dirty so the next run cleans up before reusing state.
        conf['dirty'] = True
        raise e
    finally:
        # Close and detach handlers explicitly so the log file is released.
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def train_ivector(args):
    """CLI entry point (older variant): train an ivector extractor end-to-end.

    Uses a ``config.yml`` stamp in the temp directory; any mismatch (or
    ``--clean``) wipes and rebuilds.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: strip it so basename is the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        # NOTE(review): the stamp type is 'train_and_align' although this is the
        # ivector command — presumably copied from the aligner; confirm intent.
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'train_and_align',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    # Wipe the temp directory if the previous run's inputs differ in any way.
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'train_and_align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=getattr(args, 'num_jobs', 3),
                                 debug=getattr(args, 'debug', False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        # Copy any OOV reports next to the user-visible output for inspection.
        utt_oov_path = os.path.join(corpus.split_directory(), 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train_ivector()
        a = TrainableAligner(corpus, dictionary, train_config, align_config,
                             temp_directory=data_directory)
        a.verbose = args.verbose
        a.train()
        a.save(args.output_model_path)
    except Exception as e:
        # Mark the stamp dirty so the next run cleans up before reusing state.
        conf['dirty'] = True
        raise e
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def train_dictionary(args):
    """CLI entry point: align a corpus and generate pronunciation counts.

    Aligns with a pretrained acoustic model and writes observed pronunciations
    to the output directory. The ``config.yml`` stamp only warns on mismatched
    inputs (use ``--clean`` to wipe the temp directory).
    """
    command = 'train_dictionary'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: strip it so basename is the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if args.config_path:
        align_config = align_yaml_to_config(args.config_path)
    else:
        align_config = load_basic_align()
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    logger = setup_logger(command, data_directory)
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        # Fresh stamp describing this run's inputs.
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': command,
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path,
                'acoustic_model_path': args.acoustic_model_path
                }
    # Warn (but do not wipe) if the previous run's inputs differ.
    # NOTE(review): acoustic_model_path is stored in the stamp but not part of
    # this condition, unlike the sibling commands — confirm whether intended.
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug('Previous run was a different subcommand than {} (was {})'.format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(conf['corpus_directory'],
                                                        args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))
        if conf['acoustic_model_path'] != args.acoustic_model_path:
            logger.debug('Previous run used acoustic model path {} '
                         '(new run: {})'.format(conf['acoustic_model_path'],
                                                args.acoustic_model_path))
    os.makedirs(data_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs,
                                 use_mp=align_config.use_mp,
                                 logger=logger)
        if corpus.issues_check:
            logger.warning('WARNING: Some issues parsing the corpus were detected. '
                           'Please run the validator to get more information.')
        logger.info(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set, logger=logger)
        # Ensure the model's phone set matches the dictionary before aligning.
        acoustic_model.validate(dictionary)
        begin = time.time()
        a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config,
                              temp_directory=data_directory,
                              debug=getattr(args, 'debug', False),
                              logger=logger)
        logger.debug('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose
        begin = time.time()
        a.align()
        logger.debug('Performed alignment in {} seconds'.format(time.time() - begin))
        # Write observed pronunciation counts for dictionary refinement.
        a.generate_pronunciations(args.output_directory)
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except Exception as _:
        # Mark the stamp dirty so the next run cleans up before reusing state.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def test_frclitics(frclitics_dict_path, generated_dir):
    """French clitic splitting: apostrophes split only outside dictionary words,
    hyphens split unless the full compound is in the dictionary.

    Fix: removed the dead ``x = d.write()`` assignment — the return value was
    never used.
    """
    d = Dictionary(frclitics_dict_path, os.path.join(generated_dir, 'frclitics'))
    d.write()
    assert d.separate_clitics('aujourd') == ['aujourd']
    # In-dictionary forms are kept whole even though they contain an apostrophe.
    assert d.separate_clitics("aujourd'hui") == ["aujourd'hui"]
    assert d.separate_clitics('vingt-six') == ['vingt', 'six']
    assert d.separate_clitics("m'appelle") == ["m'", 'appelle']
    assert d.separate_clitics("c'est") == ["c'est"]
    assert d.separate_clitics('purple-people-eater') == ['purple-people-eater']
    assert d.separate_clitics("m'appele") == ["m'", 'appele']
    assert d.separate_clitics("m'ving-sic") == ["m'", 'ving', 'sic']
    assert d.separate_clitics("flying'purple-people-eater") == [
        "flying'purple-people-eater"
    ]
def transcribe_corpus(args):
    """Transcribe a corpus using a pretrained acoustic model and a language model.

    Sets up (or reuses) a per-corpus temporary directory, invalidates it when
    the cached run configuration does not match the current arguments, runs
    transcription, and writes results to ``args.output_directory``.  The run
    configuration is persisted to ``config.yml`` even on failure so a dirty
    state is detected on the next run.
    """
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing path separator: strip it so basename is the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    # Fix: removed leftover debug print and a duplicated makedirs call.
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'align',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    # Any mismatch with the previous run (or an earlier crash, or --clean)
    # invalidates the cached temporary data.
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
    try:
        corpus = TranscribeCorpus(args.corpus_directory, data_directory,
                                  speaker_characters=args.speaker_characters,
                                  num_jobs=args.num_jobs)
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        language_model = LanguageModel(args.language_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory)
        # Ensure the dictionary's phone set matches the acoustic model.
        acoustic_model.validate(dictionary)
        begin = time.time()
        t = Transcriber(corpus, dictionary, acoustic_model, language_model,
                        temp_directory=data_directory,
                        debug=getattr(args, 'debug', False))
        if args.debug:
            print('Setup transcriber in {} seconds'.format(time.time() - begin))
        begin = time.time()
        # BUG FIX: the original called a.align()/a.export_textgrids() on the
        # undefined name ``a`` (copy-paste from the aligner command), which
        # raised NameError at runtime.  Use the Transcriber instance instead.
        # NOTE(review): assumes Transcriber exposes transcribe() and
        # export_transcriptions() — confirm against the Transcriber API.
        t.transcribe()
        if args.debug:
            print('Performed transcribing in {} seconds'.format(time.time() - begin))
        begin = time.time()
        t.export_transcriptions(args.output_directory)
        if args.debug:
            print('Exported transcriptions in {} seconds'.format(time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except Exception:
        # Mark the temp directory dirty so the next run starts clean.
        conf['dirty'] = True
        raise
    finally:
        # data_directory may have been rmtree'd and not recreated on an early
        # failure; only persist the config when the directory exists.
        if os.path.exists(data_directory):
            with open(conf_path, 'w') as f:
                yaml.dump(conf, f)
def sick_dict(sick_dict_path, generated_dir):
    """Build, write, and return a Dictionary for the 'sick' corpus fixtures."""
    out_dir = os.path.join(generated_dir, 'sickcorpus')
    d = Dictionary(sick_dict_path, out_dir)
    d.write()
    return d
def align_corpus(args, unknown_args=None):
    """Train an acoustic model on a corpus and align it (train_and_align).

    Sets up a per-corpus temporary directory, loads or builds the training
    and alignment configurations, warns when a stale temp directory is being
    reused, trains a TrainableAligner, and exports TextGrids (and optionally
    the trained model) to ``args.output_directory``.  The run configuration
    is persisted to ``config.yml`` even on failure.
    """
    command = 'train_and_align'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing path separator: strip it so basename is the directory name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    logger = setup_logger(command, data_directory)
    # Configuration: explicit YAML file if given, otherwise built-in defaults,
    # optionally overridden by extra command-line arguments.
    if args.config_path:
        train_config, align_config = train_yaml_to_config(args.config_path)
    else:
        train_config, align_config = load_basic_train()
    if unknown_args:
        align_config.update_from_args(unknown_args)
    conf_path = os.path.join(data_directory, 'config.yml')
    if args.debug:
        logger.warning(
            'Running in DEBUG mode, may have impact on performance and disk usage.'
        )
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        logger.info('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    # Unlike the transcribe command, a mismatch here only warns (with the
    # specific reason logged at debug level); it does not wipe the directory.
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug(
                'Previous run was a different subcommand than {} (was {})'.
                format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(
                             conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(
                conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=getattr(args, 'num_jobs', 3),
                                 debug=getattr(args, 'debug', False),
                                 logger=logger,
                                 use_mp=align_config.use_mp)
        if corpus.issues_check:
            logger.warning('Some issues parsing the corpus were detected. '
                           'Please run the validator to get more information.')
        logger.info(corpus.speaker_utterance_info())
        # Restrict the dictionary to words seen in the corpus.
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set,
                                logger=logger)
        # Copy any out-of-vocabulary reports next to the user's output.
        utt_oov_path = os.path.join(corpus.split_directory(),
                                    'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        a = TrainableAligner(corpus, dictionary, train_config, align_config,
                             temp_directory=data_directory,
                             logger=logger,
                             debug=getattr(args, 'debug', False))
        a.verbose = args.verbose
        begin = time.time()
        a.train()
        logger.debug('Training took {} seconds'.format(time.time() - begin))
        a.export_textgrids(args.output_directory)
        if args.output_model_path is not None:
            a.save(args.output_model_path)
        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() -
                                                               all_begin))
    except Exception as _:
        # Mark the temp directory dirty so the next run knows this one failed.
        conf['dirty'] = True
        raise
    finally:
        # Close and detach log handlers so the log file is released, then
        # persist the run configuration (including any dirty flag).
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)