def sick_corpus(sick_dict, basic_dir, generated_dir):
    """Prepare a corpus from ``basic_dir`` and split it against ``sick_dict``.

    The corpus is created with two jobs and its working files go under
    ``<generated_dir>/sickcorpus``.  The returned object has already had
    ``write()``, ``create_mfccs()`` and ``setup_splits(sick_dict)`` called.
    """
    # NOTE(review): presumably a pytest fixture whose decorator is outside
    # this view -- confirm against the surrounding file.
    target_dir = os.path.join(generated_dir, 'sickcorpus')
    prepared = Corpus(basic_dir, target_dir, num_jobs=2)
    prepared.write()
    prepared.create_mfccs()
    prepared.setup_splits(sick_dict)
    return prepared
def sick_corpus(sick_dict, basic_dir, generated_dir):
    """Return a fully prepared corpus built from ``basic_dir``.

    Working files are placed in the ``sickcorpus`` subdirectory of
    ``generated_dir``; the corpus uses two jobs and is split using
    ``sick_dict`` before being returned.
    """
    workspace = os.path.join(generated_dir, 'sickcorpus')
    built = Corpus(basic_dir, workspace, num_jobs=2)

    # Same preparation order as the tests in this file: write, features, splits.
    built.write()
    built.create_mfccs()
    built.setup_splits(sick_dict)

    return built
def test_stereo(basic_dict_path, textgrid_directory, generated_dir):
    """Check the feature dimension reported for the ``stereo`` corpus.

    The dictionary is written under ``<generated_dir>/stereo/basic`` and the
    corpus (read from ``<textgrid_directory>/stereo``) under
    ``<generated_dir>/stereo``.
    """
    stereo_root = os.path.join(generated_dir, 'stereo')

    lexicon = Dictionary(basic_dict_path, os.path.join(stereo_root, 'basic'))
    lexicon.write()

    stereo_source = os.path.join(textgrid_directory, 'stereo')
    stereo_corpus = Corpus(stereo_source, stereo_root)
    stereo_corpus.write()
    stereo_corpus.create_mfccs()
    stereo_corpus.setup_splits(lexicon)

    # get_feat_dim() is compared against a string, not an int.
    feature_dim = stereo_corpus.get_feat_dim()
    assert feature_dim == '39'
def test_basic(basic_dict_path, basic_dir, generated_dir):
    """Check the feature dimension reported for the basic corpus.

    Both the dictionary and the corpus use ``<generated_dir>/basic`` as their
    output location.
    """
    # The dictionary and the corpus share one directory, so join it once.
    shared_dir = os.path.join(generated_dir, "basic")

    lexicon = Dictionary(basic_dict_path, shared_dir)
    lexicon.write()

    basic_corpus = Corpus(basic_dir, shared_dir)
    basic_corpus.write()
    basic_corpus.create_mfccs()
    basic_corpus.setup_splits(lexicon)

    # get_feat_dim() is compared against a string, not an int.
    feature_dim = basic_corpus.get_feat_dim()
    assert feature_dim == "39"
def test_stereo(basic_dict_path, textgrid_directory, generated_dir):
    """Prepare the ``stereo`` corpus and verify its feature dimension is "39".

    Corpus input comes from the ``stereo`` subdirectory of
    ``textgrid_directory``; generated files go to ``<generated_dir>/stereo``,
    with the dictionary in its ``basic`` subdirectory.
    """
    work_dir = os.path.join(generated_dir, "stereo")
    dict_dir = os.path.join(work_dir, "basic")
    input_dir = os.path.join(textgrid_directory, "stereo")

    pronunciations = Dictionary(basic_dict_path, dict_dir)
    pronunciations.write()

    corpus = Corpus(input_dir, work_dir)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(pronunciations)

    # The dimension is reported as a string.
    assert "39" == corpus.get_feat_dim()
def test_basic(basic_dict_path, basic_dir, generated_dir):
    """Prepare the basic corpus and verify its feature dimension is '39'.

    The dictionary and the corpus both write into ``<generated_dir>/basic``.
    """
    basic_out = os.path.join(generated_dir, 'basic')

    pronunciations = Dictionary(basic_dict_path, basic_out)
    pronunciations.write()

    corpus = Corpus(basic_dir, basic_out)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(pronunciations)

    # The dimension is reported as a string.
    assert '39' == corpus.get_feat_dim()
def test_acoustic(basic_dir, generated_dir):
    """Check the word entries produced by ``no_dictionary`` for the basic corpus.

    Expected: each word maps to a single pronunciation made of its letters,
    the apostrophe in "here's" is dropped from the pronunciation, and
    ``<vocnoise>`` gets no entry.
    """
    acoustic_dir = os.path.join(generated_dir, 'acoustic')

    corpus = Corpus(basic_dir, acoustic_dir)
    corpus.write()
    corpus.create_mfccs()

    # NOTE(review): no_dictionary is defined outside this view; presumably it
    # derives a dictionary from the corpus itself -- confirm.
    letter_dict = no_dictionary(corpus, acoustic_dir)
    corpus.setup_splits(letter_dict)

    assert letter_dict.words['should'] == [['s', 'h', 'o', 'u', 'l', 'd']]
    assert '<vocnoise>' not in letter_dict.words
    assert letter_dict.words['here\'s'] == [['h', 'e', 'r', 'e', 's']]
def test_vietnamese(textgrid_directory, generated_dir):
    """Check ``no_dictionary`` word entries for the ``vietnamese`` corpus.

    Expected: accented characters are kept as individual pronunciation
    symbols, and ``<vocnoise>`` gets no entry.
    """
    viet_out = os.path.join(generated_dir, 'vietnamese')
    viet_in = os.path.join(textgrid_directory, 'vietnamese')

    corpus = Corpus(viet_in, viet_out)
    corpus.write()
    corpus.create_mfccs()

    # NOTE(review): no_dictionary is defined outside this view; presumably it
    # derives a dictionary from the corpus itself -- confirm.
    letter_dict = no_dictionary(corpus, viet_out)
    corpus.setup_splits(letter_dict)

    assert letter_dict.words['chăn'] == [['c', 'h', 'ă', 'n']]
    assert '<vocnoise>' not in letter_dict.words
    assert letter_dict.words['tập'] == [['t', 'ậ', 'p']]
def align_corpus(corpus_dir, dict_path, output_directory, temp_dir,
                 output_model_path, args):
    """Train an aligner on a corpus and export TextGrids after every stage.

    Args:
        corpus_dir: Corpus directory; a trailing path separator is tolerated.
        dict_path: Path handed to ``Dictionary``.
        output_directory: Directory handed to ``TrainableAligner``; the OOV
            report files are also copied into it.  Removed first when
            ``args.clean`` is set.
        temp_dir: Root for working files.  ``''`` selects the module-level
            ``TEMP_DIR``; anything else is passed through
            ``os.path.expanduser``.
        output_model_path: When not ``None``, the trained aligner is saved
            to this path.
        args: Object providing ``clean``, ``speaker_characters``,
            ``num_jobs``, ``fast`` and ``verbose``.
    """
    temp_dir = TEMP_DIR if temp_dir == '' else os.path.expanduser(temp_dir)

    # A trailing separator makes basename() return '', so step up one level.
    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)

    if args.clean:
        for stale_dir in (data_directory, output_directory):
            shutil.rmtree(stale_dir, ignore_errors=True)
    for needed_dir in (data_directory, output_directory):
        os.makedirs(needed_dir, exist_ok=True)

    dictionary = Dictionary(dict_path, data_directory)
    dictionary.write()

    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters,
                    num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(dictionary)

    # Copy whichever OOV reports exist in the split directory to the output.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)

    # Each stage gets its own dict object, as in the original code.
    mono_params = {'align_often': not args.fast}
    tri_params = {'align_often': not args.fast}
    tri_fmllr_params = {'align_often': not args.fast}
    aligner = TrainableAligner(
        corpus,
        dictionary,
        output_directory,
        temp_directory=data_directory,
        mono_params=mono_params,
        tri_params=tri_params,
        tri_fmllr_params=tri_fmllr_params,
        num_jobs=args.num_jobs,
    )
    aligner.verbose = args.verbose

    # TextGrids are exported after every training stage, not only the last.
    stages = (aligner.train_mono, aligner.train_tri, aligner.train_tri_fmllr)
    for run_stage in stages:
        run_stage()
        aligner.export_textgrids()

    if output_model_path is not None:
        aligner.save(output_model_path)
def align_corpus(model_path, corpus_dir, output_directory, temp_dir, args,
                 debug=False):
    """Align a corpus with a pretrained model and export TextGrids.

    Args:
        model_path: Path handed to ``Archive`` to load the pretrained model.
        corpus_dir: Corpus directory; a trailing path separator is tolerated.
        output_directory: Directory handed to ``PretrainedAligner``; the OOV
            report files are also copied into it.  Removed first when
            ``args.clean`` is set.
        temp_dir: Root for working files.  ``''`` selects the module-level
            ``TEMP_DIR``; anything else is passed through
            ``os.path.expanduser``.
        args: Object providing ``clean``, ``speaker_characters``,
            ``num_jobs``, ``no_speaker_adaptation`` and ``verbose``.
        debug: When true, print the elapsed time of each step.
    """
    def report(template, started):
        # Per-step timing is opt-in; the clock is read only when printing.
        if debug:
            print(template.format(time.time() - started))

    all_begin = time.time()

    if temp_dir == '':
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(temp_dir)

    # A trailing separator makes basename() return '', so step up one level.
    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)

    if args.clean:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(output_directory, exist_ok=True)

    begin = time.time()
    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters,
                    num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    report('Wrote corpus information in {} seconds', begin)

    begin = time.time()
    corpus.create_mfccs()
    report('Calculated mfccs in {} seconds', begin)

    archive = Archive(model_path)
    begin = time.time()
    a = PretrainedAligner(archive, corpus, output_directory,
                          temp_directory=data_directory,
                          num_jobs=args.num_jobs,
                          speaker_independent=args.no_speaker_adaptation)
    report('Setup pretrained aligner in {} seconds', begin)
    a.verbose = args.verbose

    # The splits use the aligner's own dictionary, so this step has to come
    # after the aligner is constructed.
    begin = time.time()
    corpus.setup_splits(a.dictionary)
    report('Setup splits in {} seconds', begin)

    # Copy whichever OOV reports exist in the split directory to the output.
    utt_oov_path = os.path.join(corpus.split_directory, 'utterance_oovs.txt')
    if os.path.exists(utt_oov_path):
        shutil.copy(utt_oov_path, output_directory)
    oov_path = os.path.join(corpus.split_directory, 'oovs_found.txt')
    if os.path.exists(oov_path):
        shutil.copy(oov_path, output_directory)

    begin = time.time()
    a.do_align()
    report('Performed alignment in {} seconds', begin)

    begin = time.time()
    a.export_textgrids()
    report('Exported textgrids in {} seconds', begin)

    # The literal was split across two physical lines in the source, which
    # left an unterminated string; it is rejoined here.
    # NOTE(review): printed regardless of ``debug`` -- the source's
    # indentation is lost, confirm this was not meant to be debug-only.
    print('Done! Everything took {} seconds'.format(time.time() - all_begin))
def align_corpus(corpus_dir, dict_path, output_directory, temp_dir,
                 output_model_path, args):
    """Train an aligner on a corpus, exporting TextGrids after each stage.

    The dictionary is built after the corpus so that it can be given the
    corpus's ``word_set``.

    Args:
        corpus_dir: Corpus directory; a trailing path separator is tolerated.
        dict_path: Path handed to ``Dictionary``.
        output_directory: Directory handed to ``TrainableAligner``; the OOV
            report files are also copied into it.  Removed first when
            ``args.clean`` is set.
        temp_dir: Root for working files.  ``''`` selects the module-level
            ``TEMP_DIR``; anything else is passed through
            ``os.path.expanduser``.
        output_model_path: When not ``None``, the trained aligner is saved
            to this path.
        args: Object providing ``clean``, ``speaker_characters``,
            ``num_jobs``, ``fast`` and ``verbose``.
    """
    if temp_dir != '':
        temp_dir = os.path.expanduser(temp_dir)
    else:
        temp_dir = TEMP_DIR

    # A trailing separator makes basename() return '', so step up one level.
    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)

    if args.clean:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(output_directory, exist_ok=True)

    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters,
                    num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    corpus.create_mfccs()

    # Built after the corpus because it takes the corpus's word set.
    dictionary = Dictionary(dict_path, data_directory,
                            word_set=corpus.word_set)
    dictionary.write()
    corpus.setup_splits(dictionary)

    # Copy whichever OOV reports exist in the split directory to the output.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)

    # Each stage gets its own dict object, as in the original code.
    mono_params = {'align_often': not args.fast}
    tri_params = {'align_often': not args.fast}
    tri_fmllr_params = {'align_often': not args.fast}
    trainer = TrainableAligner(
        corpus,
        dictionary,
        output_directory,
        temp_directory=data_directory,
        mono_params=mono_params,
        tri_params=tri_params,
        tri_fmllr_params=tri_fmllr_params,
        num_jobs=args.num_jobs,
    )
    trainer.verbose = args.verbose

    # TextGrids are exported after every training stage, not only the last.
    trainer.train_mono()
    trainer.export_textgrids()

    trainer.train_tri()
    trainer.export_textgrids()

    trainer.train_tri_fmllr()
    trainer.export_textgrids()

    if output_model_path is not None:
        trainer.save(output_model_path)
def test_extra(sick_dict, extra_dir, generated_dir):
    """Smoke-test corpus preparation for ``extra_dir`` with two jobs.

    There are no assertions; the test passes if ``write()``,
    ``create_mfccs()`` and ``setup_splits(sick_dict)`` all complete without
    raising.  Working files go under ``<generated_dir>/extra``.
    """
    extra_out = os.path.join(generated_dir, "extra")
    extra_corpus = Corpus(extra_dir, extra_out, num_jobs=2)

    extra_corpus.write()
    extra_corpus.create_mfccs()
    extra_corpus.setup_splits(sick_dict)