def generate_speech_and_text_corpora(data_dir,
                                     wav16_dir,
                                     debug,
                                     sequitur_model_path,
                                     lexicon_file_name,
                                     audio_corpora,
                                     prompt_words):
    """Build Kaldi data directories and dictionary files for the given corpora.

    Loads the lexicon and all per-corpus transcripts, splits them into
    train/test sets, exports Kaldi train/test data, optionally adds missing
    pronunciations via a sequitur g2p model, and writes the dictionary,
    phone lists and language-model training data under ``data_dir``.
    """
    logging.info("loading lexicon...")
    lex = Lexicon(file_name=lexicon_file_name)
    logging.info("loading lexicon...done.")
    logging.info("loading transcripts...")

    # With a g2p model available we can keep every transcript, since missing
    # pronunciations can be generated afterwards.
    add_all = bool(sequitur_model_path)

    ts_all, ts_train, ts_test = {}, {}, {}
    transcript_objs = []
    for corpus in audio_corpora:
        corpus_transcripts = Transcripts(corpus_name=corpus)

        all_, train_, test_ = corpus_transcripts.split(limit=debug,
                                                       add_all=add_all)

        logging.info("loading transcripts from %s (%d train, %d test) ..."
                     % (corpus, len(train_), len(test_)))

        ts_all.update(all_)
        ts_train.update(train_)
        ts_test.update(test_)
        transcript_objs.append(corpus_transcripts)

    logging.info("loading transcripts (%d train, %d test) ...done." % (
        len(ts_train), len(ts_test)))

    export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, ts_train)
    export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir, ts_test)

    # Fill lexicon gaps from the transcripts, if a g2p model was provided.
    if sequitur_model_path:
        for transcript_obj in transcript_objs:
            lex = add_missing_words(transcript_obj, lex, sequitur_model_path)

    ps, utt_dict = export_dictionary(ts_all,
                                     lex,
                                     '%s/local/dict/lexicon.txt' % data_dir,
                                     prompt_words)
    write_nonsilence_phones(
        ps, '%s/local/dict/nonsilence_phones.txt' % data_dir)

    write_silence_phones('%s/local/dict/silence_phones.txt' % data_dir)
    write_optional_silence('%s/local/dict/optional_silence.txt' % data_dir)
    write_extra_questions(ps, '%s/local/dict/extra_questions.txt' % data_dir)
    create_training_data_for_language_model(transcript_objs, utt_dict, data_dir)
예제 #2
0
def main(verbose=False, *speech_corpora):
    """Scan directory for audio files and convert them to wav files

    For each speech corpus `speech_corpus`

    1. the resulting wav files are written to the directory
       `.speechrc.wav16`/<speech_corpus>/

    2. the transcripts in data/src/speech/<speech_corpus>/transcripts_*.csv are
       updated.
    """
    misc.init_app('speech_audio_scan')

    config = misc.load_config('.speechrc')

    speech_corpora_dir = Path(config.get("speech", "speech_corpora"))
    wav16 = Path(config.get("speech", "wav16"))

    # Configure logging before the first log call so the error below is
    # emitted at the requested level/format.
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if len(speech_corpora) < 1:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)
        out_wav16_subdir = wav16 / speech_corpus
        out_wav16_subdir.mkdir(parents=True, exist_ok=True)
        in_root_corpus_dir = speech_corpora_dir / speech_corpus

        scan_audiodir(str(in_root_corpus_dir), transcripts,
                      str(out_wav16_subdir))

        transcripts.save()
        # Fixed: the original used Python 2 print statements, a syntax error
        # under Python 3 (which the pathlib usage above requires).
        print(speech_corpus, "new transcripts saved.")
        print()
예제 #3
0
def proc_transcripts(corpus_name):
    """Yield each distinct transcript string of *corpus_name* once.

    Module-level ``use_prompts`` selects between tokenized prompts and the
    raw ``ts`` field; ``lang`` selects the tokenizer language.
    """
    global use_prompts, lang

    transcripts = Transcripts(corpus_name=corpus_name)

    if use_prompts:
        unique = {u' '.join(tokenize(transcripts[cfn]["prompt"], lang))
                  for cfn in transcripts}
    else:
        unique = {transcripts[cfn]["ts"] for cfn in transcripts}

    for ts in unique:
        yield ts
#
# load lexicon, transcripts
#
# NOTE(review): script fragment — `dict_name`, `audio_corpora` and `options`
# are defined outside this view; the last statement is truncated mid-line.

logging.info("loading lexicon...")
lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

logging.info("loading transcripts...")
ts_all = {}
ts_train = {}
ts_test = {}
transcripts = {}
# maps each transcript key (cfn) back to the corpus it came from
cfn2corpus = {}
for audio_corpus in audio_corpora:
    transcripts_ = Transcripts(corpus_name=audio_corpus)

    # split() partitions into (all, train, test); `limit` caps size for debug runs
    ts_all_, ts_train_, ts_test_ = transcripts_.split(limit=options.debug)

    logging.info("loading transcripts from %s (%d train, %d test) ..." %
                 (audio_corpus, len(ts_train_), len(ts_test_)))

    ts_all.update(ts_all_)
    ts_train.update(ts_train_)
    ts_test.update(ts_test_)
    transcripts.update(transcripts_)

    for cfn in transcripts_:
        cfn2corpus[cfn] = audio_corpus

logging.info("loading transcripts (%d train, %d test) ...done." %
예제 #5
0
# Script fragment: CLI option parsing, transcript load and sentence-merge
# setup. `parser`, `WORKDIR`, `Transcripts`, `mkdirs` come from outside this
# view; the trailing `with` block is truncated.
parser.add_option ("-v", "--verbose", action="store_true", dest="verbose",
                   help="enable verbose logging")

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

# per-language working directory (WORKDIR is a format string, e.g. '.../%s')
work_dir = WORKDIR % options.lang

logging.info ('work_dir: %s' % work_dir)

logging.info ("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
logging.info ("loading transcripts... done.")

#
# merge sentences
#

logging.info ('merging sentence sources...')

mkdirs('%s' % work_dir)

num_sentences = 0

train_fn = '%s/train_all.txt' % work_dir

# NOTE(review): fragment truncated — the with-body is missing from this view.
with codecs.open (train_fn, 'w', 'utf8') as dstf:
예제 #6
0
# Script fragment: load a word-replacement table, transcripts and lexicon.
# `wrt`, `options` and `corpus_name` are defined outside this view.
with codecs.open(options.wrt, 'r', 'utf8') as wrtf:
    for line in wrtf:
        # expected line format: <word>;<replacement>; other lines are skipped
        parts = line.strip().split(';')
        if len(parts) != 2:
            continue
        wrt[parts[0]] = parts[1]

logging.info(repr(wrt))

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.dict_name)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens    = []
예제 #7
0
# Script fragment: load and split transcripts from several corpora, merging
# the per-corpus train/test dicts. `sequitur_model_path`, `audio_corpora`
# and `options` are defined outside this view.
logging.info("loading lexicon...done.")

# keep all transcripts when a g2p model can supply missing pronunciations
if sequitur_model_path:
    add_all = True
else:
    add_all = False

ts_all = {}
ts_train = {}
ts_test = {}
transcript_objs = []
for audio_corpus in audio_corpora:

    logging.info("loading transcripts from %s ..." % audio_corpus)

    transcripts = Transcripts(corpus_name=audio_corpus)

    ts_all_, ts_train_, ts_test_ = transcripts.split(limit=options.debug,
                                                     add_all=add_all,
                                                     lang=options.lang)

    ts_all.update(ts_all_)
    ts_train.update(ts_train_)
    ts_test.update(ts_test_)
    transcript_objs.append(transcripts)

    logging.info("loading transcripts from %s: %d train, %d test samples." %
                 (audio_corpus, len(ts_train_), len(ts_test_)))

logging.info("loading transcripts done, total: %d train, %d test samples." %
             (len(ts_train), len(ts_test)))
예제 #8
0
# Script fragment (truncated at both ends): derive a '<corpus>_phone' output
# corpus from the input corpus and load its transcripts. The matching `if`
# for the `else` below, and the body of the final `if`, are outside this view.
else:
    logging.basicConfig(level=logging.INFO)

if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

corpus_in = args[0]
corpus_out = corpus_in + '_phone'

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_in)
logging.info("loading transcripts...done.")

#
# config
#

config = misc.load_config('.speechrc')

corpora = config.get("speech", "speech_corpora")
wav16_dir = config.get("speech", "wav16")

out_dir = '%s/%s' % (corpora, corpus_out)
# per-process temp-file base to avoid collisions between concurrent runs
tmpfn_base = '/tmp/tmp16_%08x' % os.getpid()

if os.path.exists(out_dir):
# Script fragment (truncated at the start, mid option-declaration): scan each
# requested corpus directory, convert audio to 16 kHz wav and save updated
# transcripts. `parser`, `wav16`, `speech_corpora_dir` are defined elsewhere.
                      action="store_true",
                      dest="verbose",
                      help="verbose output")

    (options, speech_corpora) = parser.parse_args()

    if options.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if len(speech_corpora) < 1:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)
        out_wav16_subdir = '%s/%s' % (wav16, speech_corpus)
        misc.mkdirs(out_wav16_subdir)
        in_root_corpus_dir = '%s/%s' % (speech_corpora_dir, speech_corpus)

        scan_audiodir(str(in_root_corpus_dir), transcripts,
                      str(out_wav16_subdir))

        transcripts.save()

        print(speech_corpus, "new transcripts saved.")
        print("\n")
예제 #10
0
# Script fragment: apply review results from one or more CSV files to the
# transcript database. `parser`, `options` and `args` come from outside this
# view; the inner for-loop body is truncated.
if len(args) < 1:
    parser.print_help()
    sys.exit(1)

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

#
# load lexicon, transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
logging.info("loading transcripts...done.")

#
# main
#

cnt = 0

for csvfn in args:

    logging.info("applying results from %s ..." % csvfn)

    with open(csvfn, 'r') as csvf:

        for line in csvf:
예제 #11
0
# Script fragment (Python 2 — note the print statements): set up the kaldi
# work_dir layout. `work_dir`, `config`, `DEBUG_LIMIT`, `add_all`,
# `kaldi_root` and `utils` are defined outside this view.
data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

wav16_dir = config.get("speech", "wav16_dir_de")

#
# load lexicon, transcripts
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

print "loading transcripts..."
transcripts = Transcripts()
ts_all, ts_train, ts_test = transcripts.split(limit=DEBUG_LIMIT,
                                              add_all=add_all)
print "loading transcripts (%d train, %d test) ...done." % (len(ts_train),
                                                            len(ts_test))
#
# create work_dir structure
#

utils.mkdirs('%s/lexicon' % data_dir)
utils.mkdirs('%s/local/dict' % data_dir)
utils.mkdirs(wav16_dir)
utils.mkdirs(mfcc_dir)

# link kaldi's standard wsj helper scripts into the work dir
utils.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
utils.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)
예제 #12
0
# Script fragment: per-language kaldi work_dir layout (variant of the block
# above using logging/misc). `work_dir`, `config`, `options`, `kaldi_root`
# and `misc` are defined outside this view.
data_dir    = "%s/data" % work_dir
mfcc_dir    = "%s/mfcc" % work_dir

wav16_dir   = config.get("speech", "wav16_dir_%s" % options.lang)

#
# load lexicon, transcripts
#

logging.info ( "loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info ( "loading lexicon...done.")

logging.info ( "loading transcripts...")
transcripts = Transcripts(lang=options.lang)
ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all)
logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#


misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)

# link kaldi's standard wsj helper scripts into the work dir
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)
예제 #13
0
# Script fragment (truncated at both ends): load and split transcripts for
# audio statistics. `corpus_name` is defined outside this view and
# `format_duration` is cut off before its return.
    logging.basicConfig(level=logging.INFO)

#
# config
#

config = misc.load_config('.speechrc')

wav16_dir = config.get("speech", "wav16")

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

logging.info("splitting transcripts...")
ts_all, ts_train, ts_test = transcripts.split()
logging.info("splitting transcripts done, %d train, %d test." %
             (len(ts_train), len(ts_test)))

#
# audio stats
#


def format_duration(duration):
    # seconds -> (hours, minutes, seconds); rest of the function is truncated
    m, s = divmod(duration, 60)
    h, m = divmod(m, 60)
예제 #14
0
# Script fragment (truncated at both ends): build the set of accepted
# submissions from AUDIO_CORPUS transcripts. `parser`, `AUDIO_CORPUS` are
# defined outside this view.
                  dest="verbose",
                  help="verbose output")

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info('loading transcripts from %s ...' % AUDIO_CORPUS)
transcripts = Transcripts(corpus_name=AUDIO_CORPUS)
logging.info('loading transcripts from %s ... done.' % AUDIO_CORPUS)

#
# build set of accepted submissions
#

accept = set()
# microphone names recognized in utterance ids
mics = set(['Kinect-RAW', 'Realtek', 'Samson', 'Yamaha', 'Kinect-Beam'])
for utt_id in transcripts:

    data = transcripts[utt_id]
    # skip low-quality recordings (quality < 2)
    if data['quality'] < 2:
        continue

    parts = utt_id.split('-')
# Script fragment (truncated at the start, inside an export function): link
# audio files and export every corpus under the single bucket 'all'.
            # os.system(cmd)

            # utt2spkf.write('%s %s\n' % (utt_id, ts['spk']))

            utt_num[train_val] = utt_num[train_val] + 1

        cnt += 1
        lcnt += 1
        # progress report every 1000 files
        if cnt % 1000 == 0:
            logging.info(
                '%6d audio files linked from %s [%s] (%6d/%6d)...' %
                (cnt, ts['corpus_name'], train_val, lcnt, len(tsdict)))


utt_num = {'all': 0}

for audio_corpus in audio_corpora:

    logging.info("exporting transcripts from %s ..." % audio_corpus)

    transcripts = Transcripts(corpus_name=audio_corpus)

    # NOTE(review): other fragments unpack split() into a 3-tuple; here the
    # whole result is bound to ts_all — confirm split() supports this usage.
    ts_all = transcripts.split()

    export_audio('all', ts_all)

    logging.info("exported transcripts from %s: %d samples." %
                 (audio_corpus, len(ts_all)))

logging.info("All done.")
예제 #16
0
def proc_transcripts(corpus_name):
    """Yield each distinct 'ts' transcript string of *corpus_name* once."""
    transcripts = Transcripts(corpus_name=corpus_name)
    unique_ts = {transcripts[key]["ts"] for key in transcripts}
    for ts in unique_ts:
        yield ts
예제 #17
0
# Script fragment (truncated at the start, inside an export function): symlink
# wav files into numbered train/valid directories, then export per-corpus
# transcripts and start the dictionary export.
            cmd = 'ln -s %s/%s/%s.wav %s/%09d.wav' % (wav16_dir, ts['corpus_name'], utt_id, destdirfn, utt_num[train_val])
            logging.debug(cmd)
            os.system(cmd)

            # utt2spkf.write('%s %s\n' % (utt_id, ts['spk']))

            utt_num[train_val] = utt_num[train_val] + 1

# running output-file counters per destination split
utt_num = { 'train': 0, 'valid': 0 }

for audio_corpus in audio_corpora:

    logging.info("exporting transcripts from %s ..." % audio_corpus)

    transcripts = Transcripts(corpus_name=audio_corpus)

    ts_all, ts_train, ts_test = transcripts.split(limit=options.debug)

    # the test split doubles as the validation set here
    export_audio('train', ts_train)
    export_audio('valid', ts_test)

    logging.info("exported transcripts from %s: %d train, %d test samples." % (audio_corpus, len(ts_train), len(ts_test)))

#
# export dict
#

logging.info("Exporting dictionary...")

utt_dict = {}
예제 #18
0
# Script fragment (truncated at the start): load transcripts and resolve
# wav16/noise directories from config. `parser`, `LANG`, `OUT_DIR` are
# defined outside this view.
                  help="enable debug output")

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
    # silence chatty HTTP library logging in debug mode
    logging.getLogger("requests").setLevel(logging.WARNING)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(lang=LANG)
logging.info("loading transcripts...done.")

#
# config
#

config = misc.load_config('.speechrc')

wav16_dir = config.get("speech", "wav16_dir_%s" % LANG)
noise_dir = config.get("speech", "noise_dir")

# background / foreground noise sample directories
bg_dir = '%s/bg' % noise_dir
fg_dir = '%s/fg/16kHz' % noise_dir
out_dir = OUT_DIR
예제 #19
0
# Script fragment: scan transcripts for words missing from the lexicon.
# `args` and `SEQUITUR_MODEL` are defined outside this view; the final
# for-loop body is truncated.
lex_name = args[0]
corpus_name = args[1]

sequitur_model = SEQUITUR_MODEL % lex_name

#
# load lexicon, transcripts
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=lex_name)
logging.info("loading lexicon...done.")

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

#
# find missing words
#

missing = {}  # word -> count

num = len(transcripts)
cnt = 0

# counts of transcripts with/without full lexicon coverage
num_ts_lacking = 0
num_ts_complete = 0

for cfn in transcripts:
예제 #20
0
# Script fragment (Python 2 — reload(sys)/setdefaultencoding do not exist in
# Python 3): terminal setup plus transcript/lexicon load for a
# missing-word scan.
logging.basicConfig(level=logging.DEBUG)

#
# init terminal
#

reload(sys)
sys.setdefaultencoding('utf-8')
# sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

#
# load transcripts
#

transcripts = Transcripts()

#
# load lexicon
#

lex = Lexicon()

#
# find missing words
#

missing = {}  # word -> count

num = len(transcripts)
cnt = 0
예제 #21
0
# Script fragment (Python 2 — print statements, str.decode): optional
# transcript filter from argv, then transcript/lexicon/prompt load.
    dest="missing_words",
    help="only work on submissions that have at least one missing word")

(options, args) = parser.parse_args()

ts_filter = None

# a single positional argument restricts processing to matching transcripts
if len(args) == 1:
    ts_filter = args[0].decode('utf8')

#
# load transcripts
#

print "loading transcripts..."
transcripts = Transcripts()
print "loading transcripts...done."

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

#
# load prompts
#

prompt_tokens = []
예제 #22
0
# Script fragment (truncated at the start): collect transcript filters from
# argv, then load transcripts/lexicon keyed by language. `ts_filters`,
# `options` and `parser` are defined outside this view.
for a in args:
    # Python 2 idiom: decode argv bytes to unicode
    ts_filters.append(a.decode('utf8'))

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("requests").setLevel(logging.WARNING)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=options.lang)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.lang)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens = []
예제 #23
0
# Script fragment (truncated at the start): split transcripts and prepare
# train/dev CSV export under the per-language work dir. `WORKDIR`, `config`
# and `options` are defined outside this view.
    logging.basicConfig(level=logging.INFO)

#
# config
#

work_dir    = WORKDIR %options.lang 

wav16_dir   = config.get("speech", "wav16_dir_%s" % options.lang)

#
# load transcripts
#

logging.info ( "loading transcripts...")
transcripts = Transcripts(corpus_name=options.lang)
logging.info ( "loading transcripts...done. %d transcripts." % len(transcripts))
logging.info ("splitting transcripts...")
ts_all, ts_train, ts_test = transcripts.split()
logging.info ("splitting transcripts done, %d train, %d test." % (len(ts_train), len(ts_test)))

#
# create work_dir 
#

misc.mkdirs('%s' % work_dir)

# export csv files

csv_train_fn = '%s/train.csv' % work_dir
csv_dev_fn   = '%s/dev.csv'   % work_dir
예제 #24
0
# Script fragment (Python 2 — print statements; truncated at both ends):
# language-specific scan-dir setup and the start of an audio-conversion
# helper. `scan_dirs`, `config`, `lang` are defined outside this view.
    scan_dirs.append(config.get("speech", "librivoxdir"))

    wav16_dir = config.get("speech", "wav16_dir_en")

else:

    print "***ERROR: language %s not supported yet." % lang
    print
    sys.exit(1)

#
# load transcripts
#

print "loading transcripts..."
transcripts = Transcripts(lang=lang)
print "loading transcripts...done."


def audio_convert(cfn, subdir, fn, audiodir):
    # Convert one source wav into the shared 16 kHz wav16 directory.
    # NOTE(review): function body is truncated in this view.

    # global mfcc_dir
    global wav16_dir

    # convert audio if not done yet

    w16filename = "%s/%s.wav" % (wav16_dir, cfn)

    if not os.path.isfile(w16filename):

        wavfilename = "%s/%s/wav/%s.wav" % (audiodir, subdir, fn)
# Script fragment (truncated at both ends): read a lexicon file and export
# audio/prompts for LM training. `lexfn`, `lex`, `audio_corpus`, `data_dir`
# and `options` are defined outside this view.
with codecs.open(lexfn, 'r', 'utf8') as lexf:
    for line in lexf:
        # lexicon line format: <word> <phoneme> <phoneme> ...
        parts = line.strip().split(' ')
        lex[parts[0]] = ' '.join(parts[1:])

logging.info('reading lexicon %s ... done. %d entries.' % (lexfn, len(lex)))
# print repr(lex)

#
# export audio, prompts (for lm)
#

logging.info("exporting transcripts from %s ..." % audio_corpus)

transcripts = Transcripts(corpus_name=audio_corpus)

utt_num = 0

destdirfn = '%s/test' % data_dir

prompts = set()

for utt_id in transcripts:

    ts = transcripts[utt_id]
    prompts.add(u' '.join(tokenize(transcripts[utt_id]["prompt"],
                                   options.lang)))

    # only quality==0 utterances proceed past this point
    if ts['quality'] != 0:
        continue
예제 #26
0
파일: speech_stats.py 프로젝트: ilibx/nlp
# Script fragment (Python 2 — print statements, unbuffered-stdout reopen):
# load transcripts and lexicon for per-language speech statistics.
# `LANG` and `utils` are defined outside this view.
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

#
# config
#

config = utils.load_config()

wav16_dir = config.get("speech", "wav16_dir_%s" % LANG)

#
# load transcripts
#

print "loading transcripts..."
transcripts = Transcripts(lang=LANG)
print "loading transcripts...done."

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon(lang=LANG)
print "loading lexicon...done."

#
# lexicon stats
#

print