def generate_speech_and_text_corpora(data_dir, wav16_dir, debug, sequitur_model_path, lexicon_file_name, audio_corpora, prompt_words):
    """Export the speech data, dictionary files and LM training data below ``data_dir``.

    Steps, in order:

    1. Load the lexicon named ``lexicon_file_name``.
    2. Load and split the transcripts of every corpus in ``audio_corpora``
       and merge the per-corpus results into three dicts (all/train/test).
    3. Hand the train and test sets to ``export_kaldi_data`` with the targets
       ``<data_dir>/train/`` and ``<data_dir>/test/``.
    4. If ``sequitur_model_path`` is truthy, run ``add_missing_words`` once
       per loaded transcripts object, threading the lexicon through.
    5. Write the dictionary and phone files under ``<data_dir>/local/dict/``.
    6. Call ``create_training_data_for_language_model``.

    Args:
        data_dir: Base directory; every output path is built from it.
        wav16_dir: Forwarded unchanged to ``export_kaldi_data``.
        debug: Forwarded to ``Transcripts.split`` as ``limit``.
        sequitur_model_path: When truthy, ``add_all=True`` is requested from
            ``Transcripts.split`` and ``add_missing_words`` is run.
        lexicon_file_name: Forwarded to ``Lexicon`` as ``file_name``.
        audio_corpora: Iterable of corpus names; also forwarded to
            ``export_kaldi_data``.
        prompt_words: Forwarded unchanged to ``export_dictionary``.

    Returns:
        None.
    """
    logging.info("loading lexicon...")
    lex = Lexicon(file_name=lexicon_file_name)
    logging.info("loading lexicon...done.")

    logging.info("loading transcripts...")

    # The flag handed to split() is on exactly when a sequitur model path was given.
    add_all = bool(sequitur_model_path)

    merged_all, merged_train, merged_test = {}, {}, {}
    loaded_corpora = []

    for corpus in audio_corpora:
        corpus_ts = Transcripts(corpus_name=corpus)
        part_all, part_train, part_test = corpus_ts.split(limit=debug, add_all=add_all)

        logging.info("loading transcripts from %s (%d train, %d test) ..." % (corpus, len(part_train), len(part_test)))

        # dict.update: on a key collision the later corpus replaces the earlier entry.
        for merged, part in ((merged_all, part_all),
                             (merged_train, part_train),
                             (merged_test, part_test)):
            merged.update(part)
        loaded_corpora.append(corpus_ts)

    logging.info("loading transcripts (%d train, %d test) ...done." % (len(merged_train), len(merged_test)))

    export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, merged_train)
    export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir, merged_test)

    if sequitur_model_path:
        # Each call returns the lexicon that is passed on to the next one.
        for corpus_ts in loaded_corpora:
            lex = add_missing_words(corpus_ts, lex, sequitur_model_path)

    phones, utt_dict = export_dictionary(merged_all, lex, '%s/local/dict/lexicon.txt' % data_dir, prompt_words)

    write_nonsilence_phones(phones, '%s/local/dict/nonsilence_phones.txt' % data_dir)
    write_silence_phones('%s/local/dict/silence_phones.txt' % data_dir)
    write_optional_silence('%s/local/dict/optional_silence.txt' % data_dir)
    write_extra_questions(phones, '%s/local/dict/extra_questions.txt' % data_dir)

    create_training_data_for_language_model(loaded_corpora, utt_dict, data_dir)
# Working directories, both derived from work_dir (defined outside this excerpt).
data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

# Read from the "speech" section of the config, key "wav16_dir_de".
wav16_dir = config.get("speech", "wav16_dir_de")

#
# load lexicon, transcripts
#

# NOTE(review): the print statements below are Python 2 syntax.
print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

print "loading transcripts..."
transcripts = Transcripts()
# DEBUG_LIMIT and add_all are defined outside this excerpt; split() returns
# the three collections unpacked here (all, train, test).
ts_all, ts_train, ts_test = transcripts.split(limit=DEBUG_LIMIT, add_all=add_all)
print "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test))

#
# create work_dir structure
#

utils.mkdirs('%s/lexicon' % data_dir)
utils.mkdirs('%s/local/dict' % data_dir)
utils.mkdirs(wav16_dir)
utils.mkdirs(mfcc_dir)

# Link the steps/ and utils/ directories of the egs/wsj/s5 recipe under
# kaldi_root into work_dir.
utils.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
utils.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
# config = misc.load_config('.speechrc') wav16_dir = config.get("speech", "wav16") # # load transcripts # logging.info("loading transcripts...") transcripts = Transcripts(corpus_name=corpus_name) logging.info("loading transcripts...done.") logging.info("splitting transcripts...") ts_all, ts_train, ts_test = transcripts.split() logging.info("splitting transcripts done, %d train, %d test." % (len(ts_train), len(ts_test))) # # audio stats # def format_duration(duration): m, s = divmod(duration, 60) h, m = divmod(m, 60) return "%3d:%02d:%02d" % (h, m, s) def ts_stats(ts_data, ts_name):
# logging.info("loading lexicon...") lex = Lexicon(file_name=dict_name) logging.info("loading lexicon...done.") logging.info("loading transcripts...") ts_all = {} ts_train = {} ts_test = {} transcripts = {} cfn2corpus = {} for audio_corpus in audio_corpora: transcripts_ = Transcripts(corpus_name=audio_corpus) ts_all_, ts_train_, ts_test_ = transcripts_.split(limit=options.debug) logging.info("loading transcripts from %s (%d train, %d test) ..." % (audio_corpus, len(ts_train_), len(ts_test_))) ts_all.update(ts_all_) ts_train.update(ts_train_) ts_test.update(ts_test_) transcripts.update(transcripts_) for cfn in transcripts_: cfn2corpus[cfn] = audio_corpus logging.info("loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))
add_all = True else: add_all = False ts_all = {} ts_train = {} ts_test = {} transcript_objs = [] for audio_corpus in audio_corpora: logging.info("loading transcripts from %s ..." % audio_corpus) transcripts = Transcripts(corpus_name=audio_corpus) ts_all_, ts_train_, ts_test_ = transcripts.split(limit=options.debug, add_all=add_all, lang=options.lang) ts_all.update(ts_all_) ts_train.update(ts_train_) ts_test.update(ts_test_) transcript_objs.append(transcripts) logging.info("loading transcripts from %s: %d train, %d test samples." % (audio_corpus, len(ts_train_), len(ts_test_))) logging.info("loading transcripts done, total: %d train, %d test samples." % (len(ts_train), len(ts_test))) export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, ts_train) export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir, ts_test)
# Working directories, both derived from work_dir (defined outside this excerpt).
data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

# Read from the "speech" section of the config; the key is
# "wav16_dir_<lang>" with <lang> taken from options.lang.
wav16_dir = config.get("speech", "wav16_dir_%s" % options.lang)

#
# load lexicon, transcripts
#

logging.info ( "loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info ( "loading lexicon...done.")

logging.info ( "loading transcripts...")
transcripts = Transcripts(lang=options.lang)
# limit and add_all both come from the options object; split() returns the
# three collections unpacked here (all, train, test).
ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all)
logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#

misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)

# Link the steps/ and utils/ directories of the egs/wsj/s5 recipe under
# kaldi_root into work_dir.
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)
# os.system(cmd) # utt2spkf.write('%s %s\n' % (utt_id, ts['spk'])) utt_num[train_val] = utt_num[train_val] + 1 cnt += 1 lcnt += 1 if cnt % 1000 == 0: logging.info( '%6d audio files linked from %s [%s] (%6d/%6d)...' % (cnt, ts['corpus_name'], train_val, lcnt, len(tsdict))) utt_num = {'all': 0} for audio_corpus in audio_corpora: logging.info("exporting transcripts from %s ..." % audio_corpus) transcripts = Transcripts(corpus_name=audio_corpus) ts_all = transcripts.split() export_audio('all', ts_all) logging.info("exported transcripts from %s: %d samples." % (audio_corpus, len(ts_all))) logging.info("All done.")