def generate_speech_and_text_corpora(data_dir, wav16_dir, debug, sequitur_model_path, lexicon_file_name, audio_corpora, prompt_words):
    """Populate a Kaldi ``data/`` directory from the given audio corpora.

    Loads the lexicon, gathers train/test transcript splits from every
    corpus in *audio_corpora*, exports Kaldi data files for both splits,
    writes the pronunciation dictionary plus phone lists, and finally
    prepares the language-model training data.

    data_dir            -- Kaldi data directory to populate
    wav16_dir           -- root directory of the 16 kHz wav files
    debug               -- forwarded to Transcripts.split() as ``limit``
                           (presumably caps corpus size for debug runs --
                           TODO confirm against Transcripts.split)
    sequitur_model_path -- optional g2p model path; when set, OOV words are
                           added to the lexicon via add_missing_words()
    lexicon_file_name   -- lexicon file to load
    audio_corpora       -- iterable of corpus names
    prompt_words        -- forwarded unchanged to export_dictionary()
    """
    logging.info("loading lexicon...")
    lex = Lexicon(file_name=lexicon_file_name)
    logging.info("loading lexicon...done.")

    logging.info("loading transcripts...")
    # With a sequitur g2p model we can synthesize pronunciations for missing
    # words later, so all transcripts can be kept (add_all=True); without one
    # the split presumably drops utterances with OOV words -- TODO confirm.
    if sequitur_model_path:
        add_all = True
    else:
        add_all = False

    ts_all = {}
    ts_train = {}
    ts_test = {}
    transcript_objs = []

    # Merge the per-corpus splits into combined train/test/all dicts.
    for audio_corpus in audio_corpora:
        transcripts = Transcripts(corpus_name=audio_corpus)
        ts_all_, ts_train_, ts_test_ = transcripts.split(limit=debug, add_all=add_all)
        logging.info("loading transcripts from %s (%d train, %d test) ..." % (audio_corpus, len(ts_train_), len(ts_test_)))
        ts_all.update(ts_all_)
        ts_train.update(ts_train_)
        ts_test.update(ts_test_)
        transcript_objs.append(transcripts)

    logging.info("loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

    # Kaldi data files for both splits.
    export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, ts_train)
    export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir, ts_test)

    # Extend the lexicon with g2p-generated pronunciations for OOV words.
    if sequitur_model_path:
        for transcript_obj in transcript_objs:
            lex = add_missing_words(transcript_obj, lex, sequitur_model_path)

    # Dictionary + phone inventory files under data/local/dict/.
    ps, utt_dict = export_dictionary(ts_all, lex, '%s/local/dict/lexicon.txt' % data_dir, prompt_words)
    write_nonsilence_phones(ps, '%s/local/dict/nonsilence_phones.txt' % data_dir)
    write_silence_phones('%s/local/dict/silence_phones.txt' % data_dir)
    write_optional_silence('%s/local/dict/optional_silence.txt' % data_dir)
    write_extra_questions(ps, '%s/local/dict/extra_questions.txt' % data_dir)

    create_training_data_for_language_model(transcript_objs, utt_dict, data_dir)
def proc_transcripts(corpus_name):
    """Yield each distinct transcript text of *corpus_name* exactly once.

    The module-level ``use_prompts`` flag selects the text source: either
    the tokenized prompt (joined with single spaces) or the raw ``"ts"``
    field of each utterance.
    """
    global use_prompts, lang
    corpus = Transcripts(corpus_name=corpus_name)
    if use_prompts:
        texts = {u' '.join(tokenize(corpus[utt_id]["prompt"], lang)) for utt_id in corpus}
    else:
        texts = {corpus[utt_id]["ts"] for utt_id in corpus}
    # Using a set de-duplicates; iteration order is unspecified, exactly as
    # with the original set-based implementation.
    for text in texts:
        yield text
def main(verbose=False, *speech_corpora):
    """Scan directory for audio files and convert them to wav files

    For each speech corpus `speech_corpus`

    1. the resulting wav files are written to the directory
       `.speechrc.wav16`/<speech_corpus>/

    2. the transcripts in data/src/speech/<speech_corpus>/transcripts_*.csv
       are updated.
    """
    misc.init_app('speech_audio_scan')
    config = misc.load_config('.speechrc')

    speech_corpora_dir = Path(config.get("speech", "speech_corpora"))
    wav16 = Path(config.get("speech", "wav16"))

    if len(speech_corpora) < 1:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        # create_db=True: open (or create) the transcripts database for
        # this corpus so newly scanned files can be added.
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)

        out_wav16_subdir = wav16 / speech_corpus
        out_wav16_subdir.mkdir(parents=True, exist_ok=True)

        in_root_corpus_dir = speech_corpora_dir / speech_corpus
        scan_audiodir(str(in_root_corpus_dir), transcripts, str(out_wav16_subdir))
        transcripts.save()
        # FIX: the original used Python 2 print statements
        # ("print speech_corpus, ..." / bare "print"), which are a
        # SyntaxError under Python 3 -- the interpreter this function
        # already targets (pathlib, mkdir(exist_ok=True)).
        print(speech_corpus, "new transcripts saved.")
        print()
# Read a plain-text lexicon: each line is "<word> <phoneme> <phoneme> ...".
with codecs.open(lexfn, 'r', 'utf8') as lexf:
    for line in lexf:
        parts = line.strip().split(' ')
        # map word -> space-joined phoneme sequence
        lex[parts[0]] = ' '.join(parts[1:])
logging.info('reading lexicon %s ... done. %d entries.' % (lexfn, len(lex)))
# print repr(lex)

#
# export audio, prompts (for lm)
#

logging.info("exporting transcripts from %s ..." % audio_corpus)

transcripts = Transcripts(corpus_name=audio_corpus)

utt_num = 0
destdirfn = '%s/test' % data_dir

prompts = set()

for utt_id in transcripts:
    ts = transcripts[utt_id]

    # every tokenized prompt is collected for LM training, regardless of
    # the utterance's quality rating
    prompts.add(u' '.join(tokenize(transcripts[utt_id]["prompt"], options.lang)))

    # only quality==0 utterances are processed further
    # NOTE(review): meaning of the quality codes is not visible here -- confirm
    if ts['quality'] != 0:
        continue
# (continuation of a verbosity check whose `if` branch is outside this view)
else:
    logging.basicConfig(level=logging.INFO)

# exactly one positional argument (the input corpus name) is required
if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

corpus_in = args[0]
# the derived output corpus gets a '_phone' suffix
corpus_out = corpus_in + '_phone'

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_in)
logging.info("loading transcripts...done.")

#
# config
#

config = misc.load_config('.speechrc')

corpora = config.get("speech", "speech_corpora")
wav16_dir = config.get("speech", "wav16")

out_dir = '%s/%s' % (corpora, corpus_out)

# per-process temp file base to avoid collisions between concurrent runs
tmpfn_base = '/tmp/tmp16_%08x' % os.getpid()

if os.path.exists(out_dir):
data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

wav16_dir = config.get("speech", "wav16_dir_de")

#
# load lexicon, transcripts
#

# FIX: Python 2 print statements replaced with the print() call form,
# which is valid (and prints identically) under both Python 2 and 3 for
# these single-argument calls -- consistent with the Python 3 style used
# elsewhere in this codebase.
print("loading lexicon...")
lex = Lexicon()
print("loading lexicon...done.")

print("loading transcripts...")
transcripts = Transcripts()
ts_all, ts_train, ts_test = transcripts.split(limit=DEBUG_LIMIT, add_all=add_all)
print("loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#

utils.mkdirs('%s/lexicon' % data_dir)
utils.mkdirs('%s/local/dict' % data_dir)
utils.mkdirs(wav16_dir)
utils.mkdirs(mfcc_dir)

# Kaldi recipes expect steps/ and utils/ symlinked from the wsj example dir.
utils.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
utils.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)
# at least one CSV result file must be given on the command line
if len(args) < 1:
    parser.print_help()
    sys.exit(1)

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

#
# load lexicon, transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
logging.info("loading transcripts...done.")

#
# main
#

# counter for applied result rows (incremented further down, outside this view)
cnt = 0

for csvfn in args:
    logging.info("applying results from %s ..." % csvfn)

    with open(csvfn, 'r') as csvf:
        for line in csvf:
# (continuation of a per-language branch whose opening `if` is outside this
# view -- these two lines belong to the English-language case)
scan_dirs.append(config.get("speech", "librivoxdir"))
wav16_dir = config.get("speech", "wav16_dir_en")
else:
    # NOTE: Python 2 print statements -- this module targets Python 2
    print "***ERROR: language %s not supported yet." % lang
    print
    sys.exit(1)

#
# load transcripts
#

print "loading transcripts..."
transcripts = Transcripts(lang=lang)
print "loading transcripts...done."

def audio_convert(cfn, subdir, fn, audiodir):
    """Convert one corpus audio file to a 16 kHz wav, if not already done.

    cfn      -- canonical file name (used for the output wav name)
    subdir   -- subdirectory of audiodir containing the source wav
    fn       -- source file base name
    audiodir -- root directory of the source corpus audio
    (function is cut off in this view -- conversion happens below)
    """
    # global mfcc_dir
    global wav16_dir

    # convert audio if not done yet
    w16filename = "%s/%s.wav" % (wav16_dir, cfn)

    if not os.path.isfile(w16filename):
        wavfilename = "%s/%s/wav/%s.wav" % (audiodir, subdir, fn)
logging.basicConfig(level=logging.INFO) # # config # work_dir = WORKDIR %options.lang wav16_dir = config.get("speech", "wav16_dir_%s" % options.lang) # # load transcripts # logging.info ( "loading transcripts...") transcripts = Transcripts(corpus_name=options.lang) logging.info ( "loading transcripts...done. %d transcripts." % len(transcripts)) logging.info ("splitting transcripts...") ts_all, ts_train, ts_test = transcripts.split() logging.info ("splitting transcripts done, %d train, %d test." % (len(ts_train), len(ts_test))) # # create work_dir # misc.mkdirs('%s' % work_dir) # export csv files csv_train_fn = '%s/train.csv' % work_dir csv_dev_fn = '%s/dev.csv' % work_dir
def proc_transcripts(corpus_name):
    """Yield every distinct ``"ts"`` transcript text found in *corpus_name*."""
    corpus = Transcripts(corpus_name=corpus_name)
    # collect into a set first so duplicates are emitted only once
    unique_texts = {corpus[utt_id]["ts"] for utt_id in corpus}
    for text in unique_texts:
        yield text
                  help="enable debug output")

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
    # silence the very chatty requests library in debug mode
    logging.getLogger("requests").setLevel(logging.WARNING)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(lang=LANG)
logging.info("loading transcripts...done.")

#
# config
#

config = misc.load_config('.speechrc')

wav16_dir = config.get("speech", "wav16_dir_%s" % LANG)
noise_dir = config.get("speech", "noise_dir")

# background / foreground noise sources for augmentation
bg_dir = '%s/bg' % noise_dir
fg_dir = '%s/fg/16kHz' % noise_dir

out_dir = OUT_DIR
                  dest="verbose", help="verbose output")

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info('loading transcripts from %s ...' % AUDIO_CORPUS)
transcripts = Transcripts(corpus_name=AUDIO_CORPUS)
logging.info('loading transcripts from %s ... done.' % AUDIO_CORPUS)

#
# build set of accepted submissions
#

accept = set()

# microphone identifiers expected in the utterance ids of this corpus
mics = set(['Kinect-RAW', 'Realtek', 'Samson', 'Yamaha', 'Kinect-Beam'])

for utt_id in transcripts:
    data = transcripts[utt_id]

    # only utterances rated quality >= 2 are accepted
    if data['quality'] < 2:
        continue

    # utterance ids are '-'-separated; fields are used below (outside this view)
    parts = utt_id.split('-')
                  action="store_true", dest="verbose", help="verbose output")

(options, speech_corpora) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

# at least one corpus name must be given as a positional argument
if len(speech_corpora) < 1:
    logging.error("At least one speech corpus must be provided.")
    sys.exit(1)

exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

for speech_corpus in speech_corpora:
    # open (or create) the transcripts database for this corpus
    transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)

    out_wav16_subdir = '%s/%s' % (wav16, speech_corpus)
    misc.mkdirs(out_wav16_subdir)

    in_root_corpus_dir = '%s/%s' % (speech_corpora_dir, speech_corpus)
    # scan the corpus source dir, converting audio into out_wav16_subdir and
    # recording new entries in transcripts
    scan_audiodir(str(in_root_corpus_dir), transcripts, str(out_wav16_subdir))
    transcripts.save()
    print(speech_corpus, "new transcripts saved.")
    print("\n")