def training(prefix_name, returnn_exe, returnn_root, **kwargs):
    returnn_config = get_config(**kwargs)
    train_dataset, cv_dataset, devtrain_dataset, extern_data = build_training_datasets(
        returnn_python_exe=returnn_exe, returnn_root=returnn_root, output_path=prefix_name)
    returnn_config.config["extern_data"] = extern_data
    returnn_config.config["train"] = train_dataset.as_returnn_opts()
    returnn_config.config["dev"] = cv_dataset.as_returnn_opts()

    from i6_core.returnn.training import ReturnnTrainingJob

    default_rqmt = {
        'mem_rqmt': 15,
        'time_rqmt': 168,
        'log_verbosity': 5,
        'returnn_python_exe': returnn_exe,
        'returnn_root': returnn_root,
    }

    train_job = ReturnnTrainingJob(
        returnn_config=returnn_config,
        num_epochs=250,
        **default_rqmt)
    train_job.add_alias(prefix_name + "/training")
    tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates)
def get_tts_dataset_stats(zip_dataset):
    """
    Computes the global mean and stddev of the TTS mel-filterbank features on a zip corpus

    :param zip_dataset: zip corpus to compute the statistics on
    :return: mean and standard deviation outputs
    """
    config = {
        'train': {
            'class': 'OggZipDataset',
            'audio': {
                'feature_options': {'fmin': 60},
                'features': 'db_mel_filterbank',
                'num_feature_filters': 80,
                'peak_normalization': False,
                'preemphasis': 0.97,
                'step_len': 0.0125,
                'window_len': 0.05
            },
            'targets': None,
            'path': zip_dataset
        }
    }

    from recipe.returnn.dataset import ExtractDatasetStats
    dataset_stats_job = ExtractDatasetStats(config)
    dataset_stats_job.add_alias("data/tts_stats/ExtractDatasetStats")
    mean = dataset_stats_job.mean
    std_dev = dataset_stats_job.std_dev
    tk.register_output('data/tts_stats/norm.mean.txt', mean)
    tk.register_output('data/tts_stats/norm.std_dev.txt', std_dev)
    return mean, std_dev
def prepare_tts_data(bliss_dict):
    """
    :param dict bliss_dict: mapping of corpus name to bliss corpus file
    :return: (processed bliss corpora, processed zip corpora, character vocabulary)
    """
    from recipe.returnn.vocabulary import BuildCharacterVocabulary
    build_char_vocab_job = BuildCharacterVocabulary(uppercase=True)
    char_vocab = build_char_vocab_job.out

    processed_corpora = {}
    processed_zip_corpora = {}
    for name, corpus in bliss_dict.items():
        tts_name = "tts-" + name
        processed_corpus = process_corpus(bliss_corpus=corpus,
                                          char_vocab=char_vocab,
                                          silence_duration=0.1,
                                          name=tts_name)
        processed_corpora[tts_name] = processed_corpus
        tk.register_output("data/bliss/%s.processed.xml.gz" % name, processed_corpus)
        processed_zip_corpora[tts_name] = BlissToZipDataset(tts_name, processed_corpus).out

    return processed_corpora, processed_zip_corpora, char_vocab
def ctc_test_dimtag():
    ctc_lexicon = create_regular_lexicon()
    tk.register_output("experiments/librispeech_100_ctc/ctc_lexicon.xml", ctc_lexicon)
    recog_args = get_default_recog_args()
    recog_args = copy.deepcopy(recog_args)
    training_args = copy.deepcopy(get_default_training_args())
    training_args["returnn_root"] = CloneGitRepositoryJob(
        "https://github.com/rwth-i6/returnn",
        commit="d030cdeb573a4cbe5504bce5cd48d275a9ff5d7f").out_repository

    system = CtcSystem(
        returnn_config=get_returnn_config(use_dimtags=True),
        default_training_args=training_args,
        recognition_args=recog_args,
        rasr_python_home='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1',
        rasr_python_exe='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1/bin/python',
    )
    train_data, dev_data, test_data = get_corpus_data_inputs(delete_empty_orth=True)

    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/librispeech/librispeech_100_ctc/ctc_test_dimtag"
    system.init_system(rasr_init_args=rasr_args,
                       train_data=train_data,
                       dev_data=dev_data,
                       test_data=test_data)
    system.run(("extract", "train", "recog"))
    gs.ALIAS_AND_OUTPUT_SUBDIR = ""
def training(prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs=250):
    """
    :param str prefix_name:
    :param ReturnnConfig returnn_config:
    :param Path returnn_exe:
    :param Path returnn_root:
    :param int num_epochs:
    :return: the training job
    :rtype: ReturnnTrainingJob
    """
    default_rqmt = {
        'mem_rqmt': 15,
        'time_rqmt': 168,
        'log_verbosity': 5,
        'returnn_python_exe': returnn_exe,
        'returnn_root': returnn_root,
    }

    train_job = ReturnnTrainingJob(
        returnn_config=returnn_config,
        num_epochs=num_epochs,
        **default_rqmt)
    train_job.add_alias(prefix_name + "/training")
    tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates)
    return train_job
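# Usage sketch for training() above (assumption: called from a Sisyphus config where
# `get_config()`, `returnn_exe` and `returnn_root` are available, as in the test()
# entry points in this file; not a definitive recipe, just how the pieces connect):
#
# returnn_config = get_config()
# train_job = training("new_training_test_v2", returnn_config,
#                      returnn_exe, returnn_root, num_epochs=250)
# checkpoint = train_job.out_checkpoints[250]  # can be passed to search_single() below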
def test():
    returnn_exe = tk.Path("/u/rossenbach/bin/returnn_tf2.3_launcher_custom.sh",
                          hash_overwrite="GENERIC_RETURNN_LAUNCHER")
    returnn_root = CloneGitRepositoryJob(
        "https://github.com/rwth-i6/returnn",
        commit="f4f88b02f8e50996eedc475e4ce9006206a52394").out_repository
    prefix_name = "new_training_test_v2"

    returnn_config = get_config()
    train_dataset, cv_dataset, devtrain_dataset, extern_data = build_training_datasets(
        returnn_python_exe=returnn_exe, returnn_root=returnn_root, output_path=prefix_name)
    returnn_config.config["extern_data"] = extern_data
    returnn_config.config["train"] = train_dataset.as_returnn_opts()
    returnn_config.config["dev"] = cv_dataset.as_returnn_opts()

    from i6_core.returnn.training import ReturnnTrainingJob

    default_rqmt = {
        'mem_rqmt': 15,
        'time_rqmt': 80,
        'log_verbosity': 5,
        'returnn_python_exe': returnn_exe,
        'returnn_root': returnn_root,
    }

    train_job = ReturnnTrainingJob(
        returnn_config=returnn_config,
        num_epochs=250,
        **default_rqmt)
    train_job.add_alias(prefix_name + "/training")
    tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates)
def search_single(prefix_name, returnn_config, checkpoint, recognition_dataset,
                  recognition_reference, returnn_exe, returnn_root):
    """
    Run search for a specific test dataset

    :param str prefix_name:
    :param ReturnnConfig returnn_config:
    :param Checkpoint checkpoint:
    :param returnn_standalone.data.datasets.dataset.GenericDataset recognition_dataset:
    :param Path recognition_reference: Path to a py-dict format reference file
    :param Path returnn_exe:
    :param Path returnn_root:
    """
    from i6_core.returnn.search import ReturnnSearchJob, SearchBPEtoWordsJob, ReturnnComputeWERJob
    from i6_experiments.users.rossenbach.returnn.config import get_specific_returnn_config

    search_job = ReturnnSearchJob(
        search_data=recognition_dataset.as_returnn_opts(),
        model_checkpoint=checkpoint,
        returnn_config=get_specific_returnn_config(returnn_config),
        log_verbosity=5,
        mem_rqmt=8,
        returnn_python_exe=returnn_exe,
        returnn_root=returnn_root)
    search_job.add_alias(prefix_name + "/search_job")

    search_words = SearchBPEtoWordsJob(search_job.out_search_file).out_word_search_results
    wer = ReturnnComputeWERJob(search_words, recognition_reference)

    tk.register_output(prefix_name + "/search_out_words.py", search_words)
    tk.register_output(prefix_name + "/wer", wer.out_wer)
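# Usage sketch (assumption: `build_test_dataset`, `returnn_exe`, `returnn_root` and a
# finished `train_job` are available in the surrounding setup; this mirrors the inline
# dev-other search in the extended training() variant further below):
#
# dev_other, dev_other_reference = build_test_dataset(
#     "dev-other", returnn_python_exe=returnn_exe, returnn_root=returnn_root,
#     output_path=prefix_name)
# search_single(prefix_name + "/dev-other", returnn_config,
#               train_job.out_checkpoints[250], dev_other, dev_other_reference,
#               returnn_exe, returnn_root)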
def apply_uppercase_cmu_corpus_processing(bliss_corpus):
    """
    Applies the LJSpeech processing pipeline using the CMU dictionary, the default
    Sequitur G2P and the special symbols as defined in `get_static_lexicon()`

    :param Path bliss_corpus:
    :return: the fully preprocessed bliss corpus, ready to be used with a word-based RETURNN vocabulary
    :rtype: Path
    """
    cmu_bliss_lexicon = get_uppercase_bliss_lexicon(apply_g2p=False)
    cmu_uppercase_g2p = get_uppercase_cmu_g2p()

    processed_bliss_corpus = apply_corpus_pre_processing(bliss_corpus)

    oovs = ExtractOovWordsFromCorpusJob(processed_bliss_corpus, cmu_bliss_lexicon).out_oov_words
    g2p_oov_lexicon = ApplyG2PModelJob(cmu_uppercase_g2p, oovs).out_g2p_lexicon
    complete_bliss_lexicon = G2POutputToBlissLexiconJob(
        cmu_bliss_lexicon, g2p_oov_lexicon, merge=True).out_oov_lexicon
    tk.register_output("ljspeech_test/complete-lexicon.xml.gz", complete_bliss_lexicon)

    converted_bliss_corpus = ApplyLexiconToTranscriptions(
        processed_bliss_corpus, complete_bliss_lexicon,
        word_separation_orth="[space]").out_corpus
    tk.register_output("ljspeech_test/converted_corpus.xml.gz", converted_bliss_corpus)

    return converted_bliss_corpus
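# Usage sketch (assumption: the input is an LJSpeech bliss corpus, e.g. the one
# returned by get_22khz_bliss_corpus() used in the export() helper below; the
# variable names here are placeholders):
#
# ljspeech_corpus = get_22khz_bliss_corpus()
# tts_ready_corpus = apply_uppercase_cmu_corpus_processing(ljspeech_corpus)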
def get_asr_dataset_stats(zip_dataset):
    """
    This function computes the global dataset statistics (mean and stddev) on a zip corpus
    to be used in the training dataset parameters of the OggZipDataset

    :param zip_dataset: zip corpus to compute the statistics on
    :return: mean and standard deviation outputs
    """
    config = {
        'train': {
            'class': 'OggZipDataset',
            'audio': {},
            'targets': None,
            'path': zip_dataset
        }
    }

    from recipe.returnn.dataset import ExtractDatasetStats
    dataset_stats_job = ExtractDatasetStats(config)
    dataset_stats_job.add_alias("data/stats/ExtractDatasetStats")
    mean = dataset_stats_job.mean_file
    std_dev = dataset_stats_job.std_dev_file
    tk.register_output('data/stats/norm.mean.txt', mean)
    tk.register_output('data/stats/norm.std_dev.txt', std_dev)
    return mean, std_dev
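# Sketch of how the returned statistics are typically consumed (assumptions: the zip
# corpus comes from prepare_data_librispeech() below, and the RETURNN OggZipDataset
# accepts `norm_mean`/`norm_std_dev` entries in its `audio` options; verify the exact
# key names against the RETURNN version in use):
#
# mean, std_dev = get_asr_dataset_stats(zip_dict['train-clean-100'])
# train_dataset_config = {
#     'class': 'OggZipDataset',
#     'path': zip_dict['train-clean-100'],
#     'audio': {'norm_mean': mean, 'norm_std_dev': std_dev},
#     'targets': None,
# }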
def generate_speaker_embeddings(config_file, model_dir, epoch, zip_corpus, name,
                                default_parameter_dict=None):
    """
    Runs a RETURNN forward pass with the given TTS config to generate speaker embeddings
    for a zip corpus
    """
    from recipe.returnn.forward import RETURNNForwardFromFile

    parameter_dict = {
        'ext_gen_speakers': True,
        'ext_model': model_dir,
        'ext_load_epoch': epoch,
        'ext_eval_zip': zip_corpus
    }
    if default_parameter_dict:
        parameter_dict.update(default_parameter_dict)

    generate_speaker_embeddings_job = RETURNNForwardFromFile(
        config_file,
        parameter_dict=parameter_dict,
        hdf_outputs=['speaker_embeddings'],
        mem_rqmt=8)
    generate_speaker_embeddings_job.add_alias("tts_speaker_generation/" + name)

    tk.register_output("tts_speaker_generation/" + name + "_speakers.hdf",
                       generate_speaker_embeddings_job.outputs['speaker_embeddings'])

    return generate_speaker_embeddings_job.outputs['speaker_embeddings']
def test():
    path = "experiments/librispeech/librispeech_100_ar_tts/prototype_pipeline"

    dataset_group = get_silence_processed_dataset_group()
    train_corpus = dataset_group.get_segmented_corpus_object("train-clean-100-tts-train")[0].corpus_file
    tk.register_output(os.path.join(path, "processed_train_corpus.xml.gz"), train_corpus)
def _export_lm_data(output_prefix):
    """
    :param str output_prefix:
    """
    lm_dict = get_arpa_lm_dict(output_prefix=output_prefix)
    tk.register_output(
        os.path.join(output_prefix, "LibriSpeech", "lm", "3-gram.arpa.gz"),
        lm_dict["3gram"],
    )
    tk.register_output(
        os.path.join(output_prefix, "LibriSpeech", "lm", "4-gram.arpa.gz"),
        lm_dict["4gram"],
    )
def train_ttf_config(config, name, parameter_dict=None):
    """
    Trains a RETURNN TTS model from a config file and a parameter dict
    """
    from recipe.returnn import RETURNNTrainingFromFile
    asr_train = RETURNNTrainingFromFile(config,
                                        parameter_dict=parameter_dict,
                                        mem_rqmt=16)
    asr_train.add_alias("tts_training/" + name)

    # TODO: Remove
    asr_train.rqmt['qsub_args'] = '-l qname=%s' % "*080*"

    asr_train.rqmt['time'] = 167
    asr_train.rqmt['cpu'] = 8

    tk.register_output("tts_training/" + name + "_model", asr_train.model_dir)
    tk.register_output("tts_training/" + name + "_training-scores", asr_train.learning_rates)
    return asr_train
def training(prefix_name, returnn_exe, returnn_root, **kwargs):
    returnn_config = get_config(**kwargs)
    train_dataset, cv_dataset, devtrain_dataset, extern_data = build_training_datasets(
        returnn_python_exe=returnn_exe, returnn_root=returnn_root, output_path=prefix_name)
    returnn_config.config["extern_data"] = extern_data
    returnn_config.config["train"] = train_dataset.as_returnn_opts()
    returnn_config.config["dev"] = cv_dataset.as_returnn_opts()

    from i6_core.returnn.training import ReturnnTrainingJob

    default_rqmt = {
        'mem_rqmt': 15,
        'time_rqmt': 168,
        'log_verbosity': 5,
        'returnn_python_exe': returnn_exe,
        'returnn_root': returnn_root,
    }

    train_job = ReturnnTrainingJob(
        returnn_config=returnn_config,
        num_epochs=250,
        **default_rqmt)
    train_job.add_alias(prefix_name + "/training")
    tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates)

    from i6_core.returnn.search import ReturnnSearchJob, SearchBPEtoWordsJob, ReturnnComputeWERJob
    from i6_experiments.users.rossenbach.returnn.config import get_specific_returnn_config

    dev_other, dev_other_reference = build_test_dataset(
        "dev-other", returnn_python_exe=returnn_exe, returnn_root=returnn_root,
        output_path=prefix_name)

    search_job = ReturnnSearchJob(
        search_data=dev_other.as_returnn_opts(),
        model_checkpoint=train_job.out_checkpoints[250],
        returnn_config=get_specific_returnn_config(returnn_config),
        log_verbosity=5,
        mem_rqmt=8,
        returnn_python_exe=returnn_exe,
        returnn_root=returnn_root)
    search_words = SearchBPEtoWordsJob(search_job.out_search_file).out_word_search_results
    wer = ReturnnComputeWERJob(search_words, dev_other_reference)
    tk.register_output(prefix_name + "/dummy_wer", wer.out_wer)
def prepare_data_librispeech():
    """
    This function creates the LibriSpeech data in Bliss format and zip format.
    For the evaluation sets, the text is extracted in dictionary form for WER scoring

    :return: (bliss corpus dict, zip corpus dict, transcription dict)
    """
    # all datasets that are used in the experiments for LibriSpeech
    dataset_names = ['dev-clean', 'dev-other', 'test-clean', 'test-other',
                     'train-clean-100', 'train-clean-360']
    evaluation_names = ['dev-clean', 'dev-other', 'test-clean', 'test-other']

    bliss_flac_corpus_dict = {}
    zip_flac_corpus_dict = {}
    transcription_corpus_dict = {}

    for dataset_name in dataset_names:
        dataset_path = Path("../data/dataset-raw/LibriSpeech/%s/" % dataset_name)

        # open the raw LibriSpeech data and create bliss corpus
        ls_to_bliss_job = LibriSpeechToBliss(corpus_path=dataset_path, name=dataset_name)
        ls_to_bliss_job.add_alias("data/LibriSpeechToBliss/%s" % dataset_name)
        bliss_flac_corpus_dict[dataset_name] = ls_to_bliss_job.out
        tk.register_output("data/bliss/%s.xml.gz" % dataset_name, ls_to_bliss_job.out)

        # create a unified zip corpus file from the bliss corpus
        bliss_to_zip_job = BlissToZipDataset(name=dataset_name,
                                             corpus_file=ls_to_bliss_job.out,
                                             use_full_seq_name=False)
        bliss_to_zip_job.add_alias("data/BlissToZipDataset/%s" % dataset_name)
        zip_flac_corpus_dict[dataset_name] = bliss_to_zip_job.out
        tk.register_output("data/asr_zip/%s.zip" % dataset_name, bliss_to_zip_job.out)

    for dataset_name in evaluation_names:
        # create the dictionary format transcription files
        bliss_to_text_dict_job = BlissExtractTextDictionary(bliss_flac_corpus_dict[dataset_name],
                                                            segment_key_only=True)
        bliss_to_text_dict_job.add_alias("data/BlissExtractTextDictionary/%s" % dataset_name)
        transcription_corpus_dict[dataset_name] = bliss_to_text_dict_job.out

    return bliss_flac_corpus_dict, zip_flac_corpus_dict, transcription_corpus_dict
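# Sketch of how the returned dictionaries feed the rest of the pipeline (assumption:
# build_subwords() and get_asr_dataset_stats() defined in this setup are the intended
# consumers; the BPE size of 2000 is only a placeholder):
#
# bliss_dict, zip_dict, transcription_dict = prepare_data_librispeech()
# bpe_codes, bpe_vocab, bpe_vocab_size = build_subwords(
#     [bliss_dict['train-clean-100']], num_segments=2000, name='train-clean-100')
# mean, std_dev = get_asr_dataset_stats(zip_dict['train-clean-100'])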
def griffin_lim_ogg(linear_hdf, name, iterations=1):
    """
    Converts a linear spectrogram HDF into audio using Griffin & Lim phase reconstruction
    """
    from recipe.tts.toolchain import GriffinLim
    gl_job = GriffinLim(
        linear_hdf,
        iterations=iterations,
        sample_rate=16000,
        window_shift=0.0125,
        window_size=0.05,
        preemphasis=0.97,
    )
    gl_job.add_alias("gl_conversion/" + name)
    tk.register_output("generated_audio/" + name + "_audio", gl_job.out_folder)
    return gl_job.out_corpus, gl_job
def train_f2l_config(config_file, name, parameter_dict=None):
    """
    Trains the f2l RETURNN model from a config file and a parameter dict
    """
    from recipe.returnn import RETURNNTrainingFromFile
    f2l_train = RETURNNTrainingFromFile(config_file,
                                        parameter_dict=parameter_dict,
                                        mem_rqmt=16)
    f2l_train.add_alias("f2l_training/" + name)

    # f2l_train.rqmt['qsub_args'] = '-l qname=%s' % "*080*"

    f2l_train.rqmt['time'] = 96
    f2l_train.rqmt['cpu'] = 8

    tk.register_output("f2l_training/" + name + "_model", f2l_train.model_dir)
    tk.register_output("f2l_training/" + name + "_training-scores", f2l_train.learning_rates)
    return f2l_train
def get_returnn_subword_nmt(commit_hash, output_prefix=""): """ :param str commit_hash: :return: subword-nmt repo path :rtype tk.Path """ subword_nmt_job = CloneGitRepositoryJob( url="https://github.com/albertz/subword-nmt", commit=commit_hash, checkout_folder_name="subword-nmt", ) subword_nmt_job.add_alias(os.path.join(output_prefix, "clone_subword_nmt")) tk.register_output(os.path.join(output_prefix, "subword-nmt-repo"), subword_nmt_job.out_repository) return subword_nmt_job.out_repository
def export(path_prefix):
    """
    :param str path_prefix:
    """
    ljspeech_22khz_bliss_corpus = get_22khz_bliss_corpus(
        create_alias_with_prefix=path_prefix
    )
    ljspeech_16khz_bliss_corpus = get_16khz_bliss_corpus(
        create_alias_with_prefix=path_prefix
    )
    ljspeech_sequitur_model = get_g2p(create_alias_with_prefix=path_prefix)

    tk.register_output(
        os.path.join(path_prefix, "LJSpeech", "ljspeech_22khz.xml.gz"),
        ljspeech_22khz_bliss_corpus,
    )
    tk.register_output(
        os.path.join(path_prefix, "LJSpeech", "ljspeech_16khz.xml.gz"),
        ljspeech_16khz_bliss_corpus,
    )
    tk.register_output(
        os.path.join(path_prefix, "LJSpeech", "ljspeech_sequitur_g2p.model"),
        ljspeech_sequitur_model,
    )
def build_subwords(bliss_corpora, num_segments, name):
    """
    This function creates the subword codes and vocabulary files for a given bliss dataset

    :param list bliss_corpora: bliss corpora used for subword training
    :param int num_segments: number of bpe merge operations / bpe segments
    :param str name: name of the subwords
    :return: (bpe codes, bpe vocab, bpe vocab size)
    """
    corpus_texts = []
    for bliss_corpus in bliss_corpora:
        extract_text_job = BlissExtractRawText(bliss_corpus)
        corpus_texts.append(extract_text_job.out)

    from recipe.text import Concatenate
    text = Concatenate(corpus_texts).out
    subwords_job = CreateSubwordsAndVocab(text=text, num_segments=num_segments)
    subwords_job.add_alias("data/subwords/CreateSubwordsAndVocab-%s" % name)

    bpe_codes = subwords_job.out_bpe
    bpe_vocab = subwords_job.out_vocab
    bpe_vocab_size = subwords_job.out_vocab_size

    tk.register_output("data/subwords/%s.bpe.codes" % name, bpe_codes)
    tk.register_output("data/subwords/%s.bpe.vocab" % name, bpe_vocab)
    tk.register_output("data/subwords/%s.bpe.vocab_size" % name, bpe_vocab_size)
    return bpe_codes, bpe_vocab, bpe_vocab_size
def get_bpe_settings(bliss_corpus, bpe_size, subword_nmt_repo_path, unk_label="UNK", output_prefix=""): """ :param Path bliss_corpus :param int bpe_size: :param Path subword_nmt_repo_path: :param str unk_label: :param str output_prefix :return: :rtype: BPESettings """ to_text_job = CorpusToTxtJob(bliss_corpus) to_text_job.add_alias(os.path.join(output_prefix, "bliss_to_text")) train_bpe_job = ReturnnTrainBpeJob(text_file=to_text_job.out_txt, bpe_size=bpe_size, unk_label=unk_label, subword_nmt_repo=subword_nmt_repo_path) train_bpe_job.add_alias(os.path.join(output_prefix, "train_bpe")) tk.register_output(os.path.join(output_prefix, "bpe.codes"), train_bpe_job.out_bpe_codes) tk.register_output(os.path.join(output_prefix, "bpe.vocab"), train_bpe_job.out_bpe_vocab) tk.register_output(os.path.join(output_prefix, "bpe.vocab.size"), train_bpe_job.out_vocab_size) return BPESettings(train_bpe_job.out_bpe_codes, train_bpe_job.out_bpe_vocab, train_bpe_job.out_vocab_size, unk_label)
def train_asr_config(config, name, parameter_dict=None):
    """
    This function trains a RETURNN asr model, given the config and parameters

    :param Path config: RETURNN training config file
    :param str name: name of the training for alias and output naming
    :param dict parameter_dict: network/training options passed to the config
    :return: the training job
    """
    asr_train_job = RETURNNTrainingFromFile(config,
                                            parameter_dict=parameter_dict,
                                            mem_rqmt=16)
    asr_train_job.add_alias("asr_training/" + name)

    # asr_train_job.rqmt['qsub_args'] = '-l qname=%s' % "*080*"

    asr_train_job.rqmt['time'] = 167
    asr_train_job.rqmt['cpu'] = 8

    tk.register_output("asr_training/" + name + "_model", asr_train_job.model_dir)
    tk.register_output("asr_training/" + name + "_training-scores", asr_train_job.learning_rates)
    return asr_train_job
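# Usage sketch (assumption: `asr_config_file` is a RETURNN training config shipped
# with this setup; the name and the `ext_*` override below are placeholders, the real
# options depend on the config file):
#
# asr_train_job = train_asr_config(
#     config=asr_config_file,                   # hypothetical config path
#     name="librispeech-100_baseline",          # hypothetical experiment name
#     parameter_dict={'ext_num_epochs': 200})   # hypothetical ext_* override
# # asr_train_job.model_dir and an epoch number then go into
# # decode_and_evaluate_asr_config() below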
def ctc_test_hdf():
    ctc_lexicon = create_regular_lexicon()
    recog_args = get_default_recog_args()
    tk.register_output("experiments/librispeech_100_ctc/ctc_lexicon.xml", ctc_lexicon)

    # Test with feature dropout
    training_args = get_default_training_args()
    training_args = copy.deepcopy(training_args)
    training_args['keep_epochs'] = [40, 80, 120, 160, 200, 210, 220, 230, 240, 250]
    training_args['num_epochs'] = 250
    training_args['use_hdf'] = True
    recog_args = copy.deepcopy(recog_args)
    recog_args.eval_epochs = [40, 80, 120, 160, 200, 210, 220, 230, 240, 250]

    system = CtcSystem(
        returnn_config=get_returnn_config(feature_dropout=True, stronger_specaug=True, dropout=0.2),
        default_training_args=training_args,
        recognition_args=recog_args,
        rasr_python_home='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1',
        rasr_python_exe='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1/bin/python',
    )
    train_data, dev_data, test_data = get_corpus_data_inputs(delete_empty_orth=True)

    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/librispeech/librispeech_100_ctc/ctc_test_hdf"
    system.init_system(rasr_init_args=rasr_args,
                       train_data=train_data,
                       dev_data=dev_data,
                       test_data=test_data)
    system.run(("extract", "train", "recog"))
    gs.ALIAS_AND_OUTPUT_SUBDIR = ""
def decode_and_evaluate_asr_config(name, config_file, model_path, epoch, zip_corpus, text,
                                   parameter_dict, training_name=None):
    """
    This function creates the RETURNN decoding/search job, converts the output into the
    format for scoring and computes the WER score

    :param str name: name of the decoding, usually the evaluation set name and decoding options
    :param Path config_file: training config or special decoding config file path
    :param Path model_path: .model_dir variable of the training job
    :param int|tk.Variable epoch: the epoch to select from the model folder
    :param Path zip_corpus: zip corpus for decoding
    :param Path text: text dictionary file for WER computation
    :param dict parameter_dict: network options
    :param str training_name: optional name of the trained model for alias and output naming
    :return: the WER score output
    """
    path_prefix = "asr_evaluation/"
    if training_name:
        path_prefix += training_name + "/"

    local_parameter_dict = {
        'ext_eval_zip': zip_corpus,
        'ext_decoding': True,
        'ext_model': model_path,
        'ext_load_epoch': epoch,
    }
    local_parameter_dict.update(parameter_dict)

    asr_recog_job = RETURNNSearchFromFile(config_file,
                                          parameter_dict=local_parameter_dict,
                                          mem_rqmt=12,
                                          time_rqmt=1,
                                          output_mode="py")
    # TODO: Remove, this is for SGE only
    asr_recog_job.rqmt['qsub_args'] = '-l qname=%s' % "*080*"
    asr_recog_job.add_alias(path_prefix + "search_%s/recognition" % name)
    tk.register_output(path_prefix + "search_%s/asr_out" % name, asr_recog_job.out)

    bpe_to_words_job = SearchBPEtoWords(asr_recog_job.out)
    bpe_to_words_job.add_alias(path_prefix + "search_%s/bpe_to_words" % name)
    tk.register_output(path_prefix + "search_%s/words_out" % name, bpe_to_words_job.out)

    wer_score_job = ReturnnScore(bpe_to_words_job.out, text)
    wer_score_job.add_alias(path_prefix + "search_%s/wer_scoring" % name)
    tk.register_output(path_prefix + "search_%s/WER" % name, wer_score_job.out)

    return wer_score_job.out
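# Usage sketch (assumption: `asr_train_job` comes from train_asr_config() above and
# `zip_dict`/`transcription_dict` from prepare_data_librispeech(); the epoch and the
# `ext_beam_size` option are placeholders, the real ext_* options depend on the config):
#
# wer = decode_and_evaluate_asr_config(
#     name="dev-other_beam12",
#     config_file=asr_config_file,              # hypothetical decoding config path
#     model_path=asr_train_job.model_dir,
#     epoch=200,                                # placeholder epoch
#     zip_corpus=zip_dict['dev-other'],
#     text=transcription_dict['dev-other'],
#     parameter_dict={'ext_beam_size': 12},     # hypothetical ext_* option
#     training_name="librispeech-100_baseline")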
def _export_datasets(output_prefix):
    """
    :param str output_prefix:
    """
    # export all bliss corpora
    for audio_format in ["flac", "ogg", "wav"]:
        bliss_corpus_dict = get_bliss_corpus_dict(
            audio_format=audio_format, output_prefix=output_prefix
        )
        for name, bliss_corpus in bliss_corpus_dict.items():
            tk.register_output(
                os.path.join(
                    output_prefix, "LibriSpeech", "%s-%s.xml.gz" % (name, audio_format)
                ),
                bliss_corpus,
            )

    # export all ogg zip corpora
    ogg_corpus_dict = get_ogg_zip_dict(output_prefix=output_prefix)
    for name, ogg_corpus in ogg_corpus_dict.items():
        tk.register_output(
            os.path.join(output_prefix, "LibriSpeech", "%s.ogg.zip" % name),
            ogg_corpus,
        )
def ctc_test_legacy_network():
    ctc_lexicon = create_regular_lexicon()
    tk.register_output("experiments/librispeech_100_ctc/ctc_lexicon.xml", ctc_lexicon)

    system = CtcSystem(
        returnn_config=get_returnn_config(use_legacy_network=True),
        default_training_args=get_default_training_args(),
        recognition_args=recog_args,
        rasr_python_home='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1',
        rasr_python_exe='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1/bin/python',
    )
    train_data, dev_data, test_data = get_corpus_data_inputs()

    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/librispeech/librispeech_100_ctc/ctc_test_legacy_network"
    system.init_system(rasr_init_args=rasr_args,
                       train_data=train_data,
                       dev_data=dev_data,
                       test_data=test_data)
    # system.run(("extract", "train", "recog"))
    system.run(("extract", "train"))
    gs.ALIAS_AND_OUTPUT_SUBDIR = ""
def _export_lexicon_and_vocab(output_prefix):
    """
    :param str output_prefix:
    """
    lexicon_output_prefix = os.path.join(output_prefix, "LibriSpeech", "lexicon")

    # folded / without stress marker
    bliss_lexicon = get_bliss_lexicon(
        output_prefix=output_prefix, use_stress_marker=True
    )
    tk.register_output(
        os.path.join(lexicon_output_prefix, "librispeech.lexicon.folded.xml.gz"),
        bliss_lexicon,
    )
    g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict(
        use_stress_marker=True, output_prefix=output_prefix
    )
    for k, lexicon in g2p_lexicon_dict.items():
        tk.register_output(
            os.path.join(
                lexicon_output_prefix, "%s.lexicon_with_g2p.folded.xml.gz" % k
            ),
            lexicon,
        )

    # with stress marker
    bliss_lexicon = get_bliss_lexicon(
        output_prefix=output_prefix, use_stress_marker=False
    )
    tk.register_output(
        os.path.join(lexicon_output_prefix, "librispeech.lexicon.xml.gz"),
        bliss_lexicon,
    )
    g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict(
        use_stress_marker=False, output_prefix=output_prefix
    )
    for k, lexicon in g2p_lexicon_dict.items():
        tk.register_output(
            os.path.join(lexicon_output_prefix, "%s.lexicon_with_g2p.xml.gz" % k),
            lexicon,
        )
def get_bpe_settings(bliss_corpus, bpe_size, subword_nmt_repo_path, unk_label="UNK", output_prefix=""): """ Creates a BPESettings object containing codec and vocab files based on the provided parameters. As this helper is targeted for ASR, it directly accepts a bliss_corpus as input for the BPE estimation :param Path bliss_corpus: bliss corpus xml as training data for the BPE estimation :param int bpe_size: size of the BPE merge operations :param Path subword_nmt_repo_path: path to the subword_nmt_repo, can be filled with the result of `get_returnn_subword_nmt` :param str unk_label: unknown label, this should in most cases only be used for training, but maybe someone needs it. :param str output_prefix: :return: Filled BPESettings object :rtype: BPESettings """ to_text_job = CorpusToTxtJob(bliss_corpus) to_text_job.add_alias(os.path.join(output_prefix, "bliss_to_text")) train_bpe_job = ReturnnTrainBpeJob(text_file=to_text_job.out_txt, bpe_size=bpe_size, unk_label=unk_label, subword_nmt_repo=subword_nmt_repo_path) train_bpe_job.add_alias(os.path.join(output_prefix, "train_bpe")) tk.register_output(os.path.join(output_prefix, "bpe.codes"), train_bpe_job.out_bpe_codes) tk.register_output(os.path.join(output_prefix, "bpe.vocab"), train_bpe_job.out_bpe_vocab) tk.register_output(os.path.join(output_prefix, "bpe.vocab.size"), train_bpe_job.out_vocab_size) return BPESettings(train_bpe_job.out_bpe_codes, train_bpe_job.out_bpe_vocab, train_bpe_job.out_vocab_size, unk_label)
def register_outputs(self, prefix):
    tk.register_output('%s.codes' % prefix, self.codes)
    tk.register_output('%s.vocab' % prefix, self.vocab)
def ctc_test_speaker_loss():
    ctc_lexicon = create_regular_lexicon()
    recog_args = get_default_recog_args()
    tk.register_output("experiments/librispeech_100_ctc/ctc_lexicon.xml", ctc_lexicon)

    # common training and recog args
    training_args = get_default_training_args()
    training_args = copy.deepcopy(training_args)
    training_args['keep_epochs'] = [40, 80, 120, 160, 200, 210, 220, 230, 240, 250]
    training_args['num_epochs'] = 250
    recog_args = copy.deepcopy(recog_args)
    recog_args.eval_epochs = [40, 80, 120, 160, 200, 210, 220, 230, 240, 250]

    # baseline with subsampling 0:
    system = CtcSystem(
        returnn_config=get_returnn_config(feature_dropout=True, stronger_specaug=True, dropout=0.1),
        default_training_args=training_args,
        recognition_args=recog_args,
        rasr_python_home='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1',
        rasr_python_exe='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1/bin/python',
    )
    train_data, dev_data, test_data = get_corpus_data_inputs(delete_empty_orth=True)

    gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/librispeech/librispeech_100_ctc/ctc_test_nosub"
    system.init_system(rasr_init_args=rasr_args,
                       train_data=train_data,
                       dev_data=dev_data,
                       test_data=test_data)
    system.run(("extract", "train", "recog"))
    gs.ALIAS_AND_OUTPUT_SUBDIR = ""

    # Test with feature dropout
    returnn_root = CloneGitRepositoryJob(
        "https://github.com/rwth-i6/returnn",
        commit="6dc85907ee92a874973c01eee2219abf6a21d853").out_repository

    for tts_scale in [0.5, 1.0, 5.0, 10.0]:
        training_args = copy.deepcopy(training_args)
        training_args['add_speaker_map'] = True
        training_args['returnn_root'] = returnn_root
        recog_args = copy.deepcopy(recog_args)
        recog_args.compile_exec = tk.Path(
            "/u/rossenbach/bin/returnn/returnn_tf2.3.4_mkl_generic_launcher.sh")
        recog_args.blas_lib = tk.Path(
            "/work/tools/asr/tensorflow/2.3.4-generic+cuda10.1+mkl/bazel_out/external/mkl_linux/lib/libmklml_intel.so")
        recog_args.eval_epochs = [40, 80, 120, 160, 200, 210, 220, 230, 240, 250]

        system = HackyTTSCTCSystem(
            returnn_config=get_returnn_config(feature_dropout=True,
                                              stronger_specaug=True,
                                              dropout=0.1,
                                              use_tts=tts_scale),
            default_training_args=training_args,
            recognition_args=recog_args,
            rasr_python_home='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1',
            rasr_python_exe='/work/tools/asr/python/3.8.0_tf_2.3-v1-generic+cuda10.1/bin/python',
        )
        train_data, dev_data, test_data = get_corpus_data_inputs(delete_empty_orth=True)

        gs.ALIAS_AND_OUTPUT_SUBDIR = \
            "experiments/librispeech/librispeech_100_ctc/ctc_test_speaker_scale_%.1f" % tts_scale
        system.init_system(rasr_init_args=rasr_args,
                           train_data=train_data,
                           dev_data=dev_data,
                           test_data=test_data)
        system.run(("extract", "train", "recog"))

        test_align_args = {
            'label_unit': 'phoneme',
            'label_tree_args': {
                'skip_silence': True,  # no silence in tree
                'lexicon_config': {
                    'filename': create_regular_lexicon(delete_empty_orth=True),
                    'normalize_pronunciation': False,
                }  # adjust eow-monophone
            },
            'label_scorer_type': 'precomputed-log-posterior',
            'label_scorer_args': {
                'scale': 1.0,
                'usePrior': True,
                'priorScale': 0.5,
                'extraArgs': {
                    'blank-label-index': 0,
                    'reduction_factors': 2,
                }
            },
            "register_output": True,
        }
        system.nn_align("align", "train-clean-100", flow="gt",
                        tf_checkpoint=system.tf_checkpoints["default"][250],
                        pronunciation_scale=1.0,
                        alignment_options={'label-pruning': 50, 'label-pruning-limit': 100000},
                        **test_align_args)
        gs.ALIAS_AND_OUTPUT_SUBDIR = ""