def generate_speech_and_text_corpora(data_dir,
                                     wav16_dir,
                                     debug,
                                     sequitur_model_path,
                                     lexicon_file_name,
                                     audio_corpora,
                                     prompt_words):
    """Export Kaldi train/test data, dictionary files and LM training data.

    Loads the lexicon from `lexicon_file_name`, splits the transcripts of
    every corpus in `audio_corpora` into train and test sets, merges them and
    hands the result to the export helpers, all of which receive output paths
    built below `data_dir`.

    Args:
        data_dir: base directory used to build every output path.
        wav16_dir: passed through to `export_kaldi_data`.
        debug: passed to `Transcripts.split` as its `limit` argument.
        sequitur_model_path: when truthy, `split` is called with
            `add_all=True` and `add_missing_words` is run for every corpus
            before the dictionary is exported.
        lexicon_file_name: file name handed to `Lexicon`.
        audio_corpora: iterable of corpus names.
        prompt_words: passed through to `export_dictionary`.
    """
    logging.info("loading lexicon...")
    lex = Lexicon(file_name=lexicon_file_name)
    logging.info("loading lexicon...done.")
    logging.info("loading transcripts...")

    # The presence of a sequitur model decides both whether `split` is asked
    # for all entries and whether missing words get added further down.
    use_sequitur = bool(sequitur_model_path)

    ts_all, ts_train, ts_test = {}, {}, {}
    transcript_objs = []
    for corpus_name in audio_corpora:
        corpus = Transcripts(corpus_name=corpus_name)
        part_all, part_train, part_test = corpus.split(limit=debug,
                                                       add_all=use_sequitur)

        logging.info("loading transcripts from %s (%d train, %d test) ..." % (
            corpus_name, len(part_train), len(part_test)))

        # Merge this corpus into the combined dictionaries.
        for merged, part in ((ts_all, part_all),
                             (ts_train, part_train),
                             (ts_test, part_test)):
            merged.update(part)
        transcript_objs.append(corpus)

    logging.info("loading transcripts (%d train, %d test) ...done." % (
        len(ts_train), len(ts_test)))

    export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, ts_train)
    export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir, ts_test)

    if use_sequitur:
        for corpus in transcript_objs:
            lex = add_missing_words(corpus, lex, sequitur_model_path)

    lexicon_path = '%s/local/dict/lexicon.txt' % data_dir
    ps, utt_dict = export_dictionary(ts_all, lex, lexicon_path, prompt_words)

    write_nonsilence_phones(
        ps, '%s/local/dict/nonsilence_phones.txt' % data_dir)
    write_silence_phones('%s/local/dict/silence_phones.txt' % data_dir)
    write_optional_silence('%s/local/dict/optional_silence.txt' % data_dir)
    write_extra_questions(ps, '%s/local/dict/extra_questions.txt' % data_dir)

    create_training_data_for_language_model(transcript_objs, utt_dict, data_dir)
예제 #2
0
def proc_transcripts(corpus_name):
    """Yield every distinct transcript text of the corpus `corpus_name`.

    Reads the module-level `use_prompts` and `lang`: when `use_prompts` is
    truthy, each entry's "prompt" is tokenized with `lang` and the tokens are
    joined with single spaces; otherwise the entry's "ts" field is used as is.
    Duplicates are removed by collecting the texts in a set before yielding.
    """
    corpus = Transcripts(corpus_name=corpus_name)

    if not use_prompts:
        unique_texts = set(corpus[utt_id]["ts"] for utt_id in corpus)
    else:
        unique_texts = set(
            u' '.join(tokenize(corpus[utt_id]["prompt"], lang))
            for utt_id in corpus)

    for text in unique_texts:
        yield text
예제 #3
0
def main(verbose=False, *speech_corpora):
    """Scan directory for audio files and convert them to wav files

    For each speech corpus `speech_corpus`

    1. the resulting wav files are written to the directory
       `.speechrc.wav16`/<speech_corpus>/

    2. the transcripts in data/src/speech/<speech_corpus>/transcripts_*.csv are
       updated.

    Args:
        verbose: log at DEBUG level when true, at INFO level otherwise.
        *speech_corpora: names of the corpora to scan; at least one is
            required.

    Exits the process with status 1 when no corpus name is given.
    """
    misc.init_app('speech_audio_scan')

    config = misc.load_config('.speechrc')

    speech_corpora_dir = Path(config.get("speech", "speech_corpora"))
    wav16 = Path(config.get("speech", "wav16"))

    if not speech_corpora:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)
        out_wav16_subdir = wav16 / speech_corpus
        out_wav16_subdir.mkdir(parents=True, exist_ok=True)
        in_root_corpus_dir = speech_corpora_dir / speech_corpus

        scan_audiodir(str(in_root_corpus_dir), transcripts,
                      str(out_wav16_subdir))

        transcripts.save()
        # Single-argument print calls emit the same text on Python 2 and 3;
        # the former print statements were a SyntaxError on Python 3.
        print("%s new transcripts saved." % speech_corpus)
        print("")
# Read the lexicon file as UTF-8. Each line is stripped and split on single
# spaces: the first field becomes the key in `lex`, the remaining fields
# (re-joined with single spaces) become its value.
with codecs.open(lexfn, 'r', 'utf8') as lexf:
    for line in lexf:
        parts = line.strip().split(' ')
        lex[parts[0]] = ' '.join(parts[1:])

logging.info('reading lexicon %s ... done. %d entries.' % (lexfn, len(lex)))
# print repr(lex)

#
# export audio, prompts (for lm)
#

logging.info("exporting transcripts from %s ..." % audio_corpus)

transcripts = Transcripts(corpus_name=audio_corpus)

utt_num = 0

destdirfn = '%s/test' % data_dir

# Distinct tokenized prompts, joined with single spaces.
prompts = set()

for utt_id in transcripts:

    ts = transcripts[utt_id]
    # The prompt is recorded for every utterance, before the quality filter
    # below is applied.
    prompts.add(u' '.join(tokenize(transcripts[utt_id]["prompt"],
                                   options.lang)))

    # Entries whose quality is not 0 are skipped.
    # NOTE(review): the rest of this loop body is not visible in this excerpt.
    if ts['quality'] != 0:
        continue
예제 #5
0
else:
    logging.basicConfig(level=logging.INFO)

# Exactly one positional argument (the input corpus name) is required;
# otherwise print the usage line and exit with status 1.
if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

corpus_in = args[0]
# The output corpus name is the input name with a '_phone' suffix.
corpus_out = corpus_in + '_phone'

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_in)
logging.info("loading transcripts...done.")

#
# config
#

config = misc.load_config('.speechrc')

corpora = config.get("speech", "speech_corpora")
wav16_dir = config.get("speech", "wav16")

# Output directory: <speech_corpora>/<corpus_out>.
out_dir = '%s/%s' % (corpora, corpus_out)
# Base name under /tmp that embeds the current process id as 8 hex digits.
tmpfn_base = '/tmp/tmp16_%08x' % os.getpid()

if os.path.exists(out_dir):
예제 #6
0
# Sub-directories of the work directory.
data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

wav16_dir = config.get("speech", "wav16_dir_de")

#
# load lexicon, transcripts
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

print "loading transcripts..."
transcripts = Transcripts()
# Split into the complete set plus train and test subsets; DEBUG_LIMIT and
# add_all are defined outside this excerpt.
ts_all, ts_train, ts_test = transcripts.split(limit=DEBUG_LIMIT,
                                              add_all=add_all)
print "loading transcripts (%d train, %d test) ...done." % (len(ts_train),
                                                            len(ts_test))
#
# create work_dir structure
#

utils.mkdirs('%s/lexicon' % data_dir)
utils.mkdirs('%s/local/dict' % data_dir)
utils.mkdirs(wav16_dir)
utils.mkdirs(mfcc_dir)

# Link the `steps` and `utils` directories of the Kaldi wsj/s5 recipe into
# the work directory.
utils.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
utils.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)
예제 #7
0
# At least one positional argument is required; otherwise print the help
# text and exit with status 1.
if len(args) < 1:
    parser.print_help()
    sys.exit(1)

# Log at DEBUG level when the verbose option is set, at INFO level otherwise.
if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
logging.info("loading transcripts...done.")

#
# main
#

# NOTE(review): counter initialised here; its use is outside this excerpt.
cnt = 0

for csvfn in args:

    logging.info("applying results from %s ..." % csvfn)

    with open(csvfn, 'r') as csvf:

        for line in csvf:
예제 #8
0
    scan_dirs.append(config.get("speech", "librivoxdir"))

    wav16_dir = config.get("speech", "wav16_dir_en")

else:

    print "***ERROR: language %s not supported yet." % lang
    print
    sys.exit(1)

#
# load transcripts for the selected language `lang`
#

print "loading transcripts..."
transcripts = Transcripts(lang=lang)
print "loading transcripts...done."


def audio_convert(cfn, subdir, fn, audiodir):
    """Prepare the conversion of one audio file into `wav16_dir`.

    The visible part builds the target path `<wav16_dir>/<cfn>.wav` and, only
    when that file does not exist yet, the source path
    `<audiodir>/<subdir>/wav/<fn>.wav`.

    NOTE(review): only the beginning of this function is visible in this
    excerpt; the actual conversion step is not shown.
    """

    # global mfcc_dir
    global wav16_dir

    # convert audio if not done yet

    w16filename = "%s/%s.wav" % (wav16_dir, cfn)

    # An existing target file means there is nothing to do.
    if not os.path.isfile(w16filename):

        wavfilename = "%s/%s/wav/%s.wav" % (audiodir, subdir, fn)
예제 #9
0
    logging.basicConfig(level=logging.INFO)

#
# config
#

# WORKDIR is a format string defined outside this excerpt; the language
# option is substituted into it.
work_dir    = WORKDIR %options.lang 

# The config key is language specific: wav16_dir_<lang>.
wav16_dir   = config.get("speech", "wav16_dir_%s" % options.lang)

#
# load transcripts
#

logging.info ( "loading transcripts...")
# The language option doubles as the corpus name here.
transcripts = Transcripts(corpus_name=options.lang)
logging.info ( "loading transcripts...done. %d transcripts." % len(transcripts))
logging.info ("splitting transcripts...")
ts_all, ts_train, ts_test = transcripts.split()
logging.info ("splitting transcripts done, %d train, %d test." % (len(ts_train), len(ts_test)))

#
# create work_dir 
#

misc.mkdirs('%s' % work_dir)

# export csv files

# Paths of the train and dev csv files inside the work directory.
csv_train_fn = '%s/train.csv' % work_dir
csv_dev_fn   = '%s/dev.csv'   % work_dir
예제 #10
0
def proc_transcripts(corpus_name):
    """Yield each distinct "ts" value found in the corpus `corpus_name`.

    The values are collected in a set first, so duplicates are dropped.
    """
    corpus = Transcripts(corpus_name=corpus_name)
    unique_texts = {corpus[utt_id]["ts"] for utt_id in corpus}
    for text in unique_texts:
        yield text
예제 #11
0
                  help="enable debug output")

(options, args) = parser.parse_args()

# Verbose mode logs at DEBUG level but keeps the "requests" logger at
# WARNING; otherwise everything logs at INFO level.
if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("requests").setLevel(logging.WARNING)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
# LANG is a constant defined outside this excerpt.
transcripts = Transcripts(lang=LANG)
logging.info("loading transcripts...done.")

#
# config
#

config = misc.load_config('.speechrc')

# The wav16 config key is language specific: wav16_dir_<LANG>.
wav16_dir = config.get("speech", "wav16_dir_%s" % LANG)
noise_dir = config.get("speech", "noise_dir")

# Sub-directories of the configured noise directory.
bg_dir = '%s/bg' % noise_dir
fg_dir = '%s/fg/16kHz' % noise_dir
out_dir = OUT_DIR
예제 #12
0
                  dest="verbose",
                  help="verbose output")

(options, args) = parser.parse_args()

# Log at DEBUG level when the verbose option is set, at INFO level otherwise.
if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

# AUDIO_CORPUS is a constant defined outside this excerpt.
logging.info('loading transcripts from %s ...' % AUDIO_CORPUS)
transcripts = Transcripts(corpus_name=AUDIO_CORPUS)
logging.info('loading transcripts from %s ... done.' % AUDIO_CORPUS)

#
# build set of accepted submissions
#

accept = set()
# NOTE(review): presumably microphone names; not used in the visible part of
# this excerpt.
mics = set(['Kinect-RAW', 'Realtek', 'Samson', 'Yamaha', 'Kinect-Beam'])
for utt_id in transcripts:

    data = transcripts[utt_id]
    # Entries with a quality below 2 are skipped.
    if data['quality'] < 2:
        continue

    # The utterance id is split on '-'.
    # NOTE(review): the rest of this loop body is not visible in this excerpt.
    parts = utt_id.split('-')
                      action="store_true",
                      dest="verbose",
                      help="verbose output")

    # The positional arguments are the names of the speech corpora to scan.
    (options, speech_corpora) = parser.parse_args()

    # Log at DEBUG level when the verbose option is set, at INFO otherwise.
    if options.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # At least one corpus name is required; otherwise exit with status 1.
    if len(speech_corpora) < 1:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)
        # Output goes to <wav16>/<speech_corpus>, input is read from
        # <speech_corpora_dir>/<speech_corpus>.
        out_wav16_subdir = '%s/%s' % (wav16, speech_corpus)
        misc.mkdirs(out_wav16_subdir)
        in_root_corpus_dir = '%s/%s' % (speech_corpora_dir, speech_corpus)

        scan_audiodir(str(in_root_corpus_dir), transcripts,
                      str(out_wav16_subdir))

        # Save the transcripts after the scan.
        transcripts.save()

        print(speech_corpus, "new transcripts saved.")
        print("\n")