def main(verbose=False, *speech_corpora):
    """Scan speech corpora for audio files and convert them to wav files.

    For each speech corpus `speech_corpus`

    1. the resulting wav files are written to the directory
       `.speechrc.wav16`/<speech_corpus>/
    2. the transcripts in
       data/src/speech/<speech_corpus>/transcripts_*.csv are updated.

    Args:
        verbose: When true, logging is configured at DEBUG level,
            otherwise at INFO level.
        *speech_corpora: Names of the corpora to process. Each name is
            resolved as a subdirectory of the `speech_corpora` path read
            from the "speech" section of `.speechrc`.

    Exits the process with status 1 when no corpus name is given.
    """
    misc.init_app('speech_audio_scan')

    config = misc.load_config('.speechrc')
    speech_corpora_dir = Path(config.get("speech", "speech_corpora"))
    wav16 = Path(config.get("speech", "wav16"))

    if not speech_corpora:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    # NOTE(review): presumably aborts when a corpus directory does not exist
    # below speech_corpora_dir -- confirm against the helper's definition.
    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)

        # Each corpus gets its own output directory below the wav16 root.
        out_wav16_subdir = wav16 / speech_corpus
        out_wav16_subdir.mkdir(parents=True, exist_ok=True)

        in_root_corpus_dir = speech_corpora_dir / speech_corpus

        # scan_audiodir is handed plain strings rather than Path objects.
        scan_audiodir(str(in_root_corpus_dir), transcripts,
                      str(out_wav16_subdir))

        transcripts.save()

        # Single-argument print(...) calls emit the same text on Python 2
        # (statement form with a parenthesised expression) and Python 3
        # (function call); the original print statements were Python 2 only.
        print("%s new transcripts saved." % speech_corpus)
        print("")
t = missing_token else: t = tokenize(ts['ts'])[0] lex_edit(t) # # fini # curses.nocbreak() stdscr.keypad(0) curses.echo() curses.endwin() transcripts.save() print "new transcripts saved." print lex.save() print "new lexicon saved." print except: curses.nocbreak() stdscr.keypad(0) curses.echo() curses.endwin() print u"*** ERROR: Unexpected error:", sys.exc_info()[0] traceback.print_exc()