def generate_speech_and_text_corpora(data_dir,
                                     wav16_dir,
                                     debug,
                                     sequitur_model_path,
                                     lexicon_file_name,
                                     audio_corpora,
                                     prompt_words):
    """Export train/test data, dictionary files and LM training data.

    The lexicon is loaded from ``lexicon_file_name`` and one ``Transcripts``
    object is created and split per entry of ``audio_corpora``.  The merged
    train and test splits are handed to ``export_kaldi_data``; the merged
    "all" split, together with the lexicon, is handed to
    ``export_dictionary``.  The phone/silence/question files and the
    language-model training data are written afterwards.

    Args:
        data_dir: Base directory; all output paths are built beneath it.
        wav16_dir: Forwarded unchanged to ``export_kaldi_data``.
        debug: Passed as ``limit`` to ``Transcripts.split``.
        sequitur_model_path: When truthy, ``split`` is called with
            ``add_all=True`` and ``add_missing_words`` is run once per
            corpus with this path before the dictionary is exported.
        lexicon_file_name: Passed as ``file_name`` to ``Lexicon``.
        audio_corpora: Iterable of corpus names.
        prompt_words: Forwarded unchanged to ``export_dictionary``.
    """
    logging.info("loading lexicon...")
    lexicon = Lexicon(file_name=lexicon_file_name)
    logging.info("loading lexicon...done.")

    logging.info("loading transcripts...")

    use_sequitur = bool(sequitur_model_path)

    # Per-corpus splits are merged into these dicts; on a key clash the
    # corpus processed later wins.
    merged_all, merged_train, merged_test = {}, {}, {}
    corpus_transcripts = []

    for corpus_name in audio_corpora:
        corpus_ts = Transcripts(corpus_name=corpus_name)
        part_all, part_train, part_test = corpus_ts.split(limit=debug,
                                                          add_all=use_sequitur)

        logging.info("loading transcripts from %s (%d train, %d test) ...",
                     corpus_name, len(part_train), len(part_test))

        merged_all.update(part_all)
        merged_train.update(part_train)
        merged_test.update(part_test)
        corpus_transcripts.append(corpus_ts)

    logging.info("loading transcripts (%d train, %d test) ...done.",
                 len(merged_train), len(merged_test))

    export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir,
                      merged_train)
    export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir,
                      merged_test)

    if use_sequitur:
        # add_missing_words returns the lexicon to use from here on, so it
        # is threaded through every corpus in turn.
        for corpus_ts in corpus_transcripts:
            lexicon = add_missing_words(corpus_ts, lexicon,
                                        sequitur_model_path)

    phones, utt_dict = export_dictionary(
        merged_all,
        lexicon,
        '%s/local/dict/lexicon.txt' % data_dir,
        prompt_words)

    write_nonsilence_phones(
        phones, '%s/local/dict/nonsilence_phones.txt' % data_dir)
    write_silence_phones('%s/local/dict/silence_phones.txt' % data_dir)
    write_optional_silence('%s/local/dict/optional_silence.txt' % data_dir)
    write_extra_questions(
        phones, '%s/local/dict/extra_questions.txt' % data_dir)

    create_training_data_for_language_model(corpus_transcripts, utt_dict,
                                            data_dir)
Пример #2
0
# Output locations beneath work_dir (work_dir is defined outside this
# excerpt).
data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

wav16_dir = config.get("speech", "wav16_dir_de")

#
# load lexicon, transcripts
#

# print(...) with one pre-formatted string emits the same text whether it is
# parsed as a Python 2 print statement with a parenthesised expression or as
# a Python 3 function call; the bare print statement is a SyntaxError on
# Python 3.
print("loading lexicon...")
lex = Lexicon()
print("loading lexicon...done.")

print("loading transcripts...")
transcripts = Transcripts()
# DEBUG_LIMIT and add_all are defined outside this excerpt.
ts_all, ts_train, ts_test = transcripts.split(limit=DEBUG_LIMIT,
                                              add_all=add_all)
print("loading transcripts (%d train, %d test) ...done." % (len(ts_train),
                                                            len(ts_test)))
#
# create work_dir structure
#

utils.mkdirs('%s/lexicon' % data_dir)
utils.mkdirs('%s/local/dict' % data_dir)
utils.mkdirs(wav16_dir)
utils.mkdirs(mfcc_dir)

# Link the 'steps' and 'utils' directories of the Kaldi wsj/s5 recipe into
# work_dir (kaldi_root is defined outside this excerpt).
utils.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
utils.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
Пример #3
0
#

# Load the configuration via misc.load_config from '.speechrc'.
config = misc.load_config('.speechrc')

# Value of the "wav16" entry in the "speech" section of the configuration.
wav16_dir = config.get("speech", "wav16")

#
# load transcripts
#

logging.info("loading transcripts...")
# corpus_name is not defined in this excerpt; presumably set earlier in the
# script -- confirm.
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

logging.info("splitting transcripts...")
# split() is unpacked into three collections: all, train and test.
ts_all, ts_train, ts_test = transcripts.split()
logging.info("splitting transcripts done, %d train, %d test." %
             (len(ts_train), len(ts_test)))

#
# audio stats
#


def format_duration(duration):
    """Format a duration given in seconds as ``HHH:MM:SS``.

    The hour field is space-padded to width 3; minutes and seconds are
    zero-padded to width 2.  Fractional seconds are truncated by the ``%d``
    conversion.
    """
    whole_minutes = duration // 60
    seconds = duration % 60
    hours = whole_minutes // 60
    minutes = whole_minutes % 60
    return "%3d:%02d:%02d" % (hours, minutes, seconds)


def ts_stats(ts_data, ts_name):
#

# Load the lexicon from the file named by dict_name (dict_name is defined
# outside this excerpt).
logging.info("loading lexicon...")
lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

logging.info("loading transcripts...")
# Accumulators merged across every corpus listed in audio_corpora.
ts_all = {}
ts_train = {}
ts_test = {}
transcripts = {}
# Maps each item yielded by iterating a corpus' Transcripts object to the
# name of the corpus it came from.
cfn2corpus = {}
for audio_corpus in audio_corpora:
    transcripts_ = Transcripts(corpus_name=audio_corpus)

    # options.debug is passed as the split limit.
    ts_all_, ts_train_, ts_test_ = transcripts_.split(limit=options.debug)

    logging.info("loading transcripts from %s (%d train, %d test) ..." %
                 (audio_corpus, len(ts_train_), len(ts_test_)))

    # dict.update: on a key clash the corpus processed later wins.
    ts_all.update(ts_all_)
    ts_train.update(ts_train_)
    ts_test.update(ts_test_)
    # NOTE(review): dict.update() needs a mapping or an iterable of pairs;
    # presumably Transcripts is mapping-like (keys()/__getitem__) -- confirm.
    transcripts.update(transcripts_)

    for cfn in transcripts_:
        cfn2corpus[cfn] = audio_corpus

logging.info("loading transcripts (%d train, %d test) ...done." %
             (len(ts_train), len(ts_test)))
Пример #5
0
    add_all = True
else:
    add_all = False

# Accumulators merged across every corpus listed in audio_corpora.
ts_all = {}
ts_train = {}
ts_test = {}
# One Transcripts object per corpus, kept in audio_corpora order.
transcript_objs = []
for audio_corpus in audio_corpora:

    logging.info("loading transcripts from %s ..." % audio_corpus)

    transcripts = Transcripts(corpus_name=audio_corpus)

    # add_all is assigned just above this excerpt by an if/else whose
    # condition is not visible here.
    ts_all_, ts_train_, ts_test_ = transcripts.split(limit=options.debug,
                                                     add_all=add_all,
                                                     lang=options.lang)

    # dict.update: on a key clash the corpus processed later wins.
    ts_all.update(ts_all_)
    ts_train.update(ts_train_)
    ts_test.update(ts_test_)
    transcript_objs.append(transcripts)

    logging.info("loading transcripts from %s: %d train, %d test samples." %
                 (audio_corpus, len(ts_train_), len(ts_test_)))

logging.info("loading transcripts done, total: %d train, %d test samples." %
             (len(ts_train), len(ts_test)))

# Export the merged train and test splits into <data_dir>/train/ and
# <data_dir>/test/ (wav16_dir and data_dir are defined outside this excerpt).
export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, ts_train)
export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir, ts_test)
Пример #6
0
# Output locations beneath work_dir (work_dir is defined outside this
# excerpt).
data_dir    = "%s/data" % work_dir
mfcc_dir    = "%s/mfcc" % work_dir

# The config key is built from the selected language: "wav16_dir_<lang>".
wav16_dir   = config.get("speech", "wav16_dir_%s" % options.lang)

#
# load lexicon, transcripts
#

logging.info ( "loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info ( "loading lexicon...done.")

logging.info ( "loading transcripts...")
transcripts = Transcripts(lang=options.lang)
# split() is unpacked into three collections: all, train and test.
ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all)
logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#


misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)

# Link the 'steps' and 'utils' directories of the Kaldi wsj/s5 recipe into
# work_dir (kaldi_root is defined outside this excerpt).
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)
            # os.system(cmd)

            # utt2spkf.write('%s %s\n' % (utt_id, ts['spk']))

            utt_num[train_val] = utt_num[train_val] + 1

        cnt += 1
        lcnt += 1
        if cnt % 1000 == 0:
            logging.info(
                '%6d audio files linked from %s [%s] (%6d/%6d)...' %
                (cnt, ts['corpus_name'], train_val, lcnt, len(tsdict)))


# Counter dict with a single 'all' entry; the function ending just above
# increments utt_num[train_val].
utt_num = {'all': 0}

for audio_corpus in audio_corpora:

    logging.info("exporting transcripts from %s ..." % audio_corpus)

    transcripts = Transcripts(corpus_name=audio_corpus)

    # NOTE(review): the whole return value of split() is bound to ts_all.
    # If split() returns several collections (e.g. all/train/test) rather
    # than a single one, export_audio receives that tuple and len(ts_all)
    # below counts its members, not samples -- confirm against
    # Transcripts.split.
    ts_all = transcripts.split()

    export_audio('all', ts_all)

    logging.info("exported transcripts from %s: %d samples." %
                 (audio_corpus, len(ts_all)))

logging.info("All done.")