def build_test_dataset(dataset_key, returnn_python_exe, returnn_root, output_path, bpe_size=2000):
    """
    Build a recognition MetaDataset for the given LibriSpeech corpus key, together with
    the text-dict reference file used for scoring.

    Helpers such as get_ogg_zip_dict, get_bliss_corpus_dict, get_bpe_datastream,
    get_default_asr_audio_datastream and the returnn_standalone package are expected to
    be imported at module level.
    """
    ogg_zip_dict = get_ogg_zip_dict("corpora")
    bliss_dict = get_bliss_corpus_dict()
    test_ogg = ogg_zip_dict[dataset_key]

    from i6_core.corpus.convert import CorpusToTextDictJob
    test_reference_dict_file = CorpusToTextDictJob(bliss_dict[dataset_key]).out_dictionary

    train_bpe_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=True)
    audio_datastream = get_audio_datastream(returnn_python_exe, returnn_root, output_path)

    data_map = {
        "audio_features": ("zip_dataset", "data"),
        "bpe_labels": ("zip_dataset", "classes"),
    }

    test_zip_dataset = returnn_standalone.data.datasets.OggZipDataset(
        path=[test_ogg],
        audio_opts=audio_datastream.as_returnn_audio_opts(),
        target_opts=train_bpe_datastream.as_returnn_targets_opts(),
        seq_ordering="sorted_reverse",
    )
    test_dataset = returnn_standalone.data.datasets.MetaDataset(
        data_map=data_map,
        datasets={"zip_dataset": test_zip_dataset},
        seq_order_control_dataset="zip_dataset",
    )

    return test_dataset, test_reference_dict_file
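
# Example (sketch): collecting recognition datasets for several evaluation corpora.
# The corpus keys and the helper name below are illustrative assumptions, not part of
# the pipeline above; use whatever keys exist in your ogg-zip dict.
def _example_build_all_test_datasets(returnn_python_exe, returnn_root, output_path):
    test_datasets = {}
    for key in ["dev-clean", "dev-other", "test-clean", "test-other"]:  # assumed keys
        dataset, reference = build_test_dataset(
            dataset_key=key,
            returnn_python_exe=returnn_python_exe,
            returnn_root=returnn_root,
            output_path=output_path,
        )
        test_datasets[key] = (dataset, reference)
    return test_datasets
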
def get_audio_datastream(returnn_python_exe, returnn_root, output_path):
    ogg_zip_dict = get_ogg_zip_dict("corpora")
    train_clean_100_ogg = ogg_zip_dict['train-clean-100']

    audio_datastream = get_default_asr_audio_datastream(
        statistics_ogg_zip=train_clean_100_ogg,
        returnn_python_exe=returnn_python_exe,
        returnn_root=returnn_root,
        output_path=output_path,
    )
    return audio_datastream
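
# Example (sketch): how the audio datastream is consumed by the dataset builders in
# this module. Both accessors are the ones used above; the variable names here are
# only illustrative.
def _example_audio_opts(returnn_python_exe, returnn_root, output_path):
    audio_datastream = get_audio_datastream(returnn_python_exe, returnn_root, output_path)
    extern_data_entry = audio_datastream.as_returnn_data_opts()    # -> extern_data["audio_features"]
    ogg_zip_audio_opts = audio_datastream.as_returnn_audio_opts()  # -> OggZipDataset(audio_opts=...)
    return extern_data_entry, ogg_zip_audio_opts
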
def build_training_datasets(returnn_python_exe, returnn_root, output_path):
    """
    Build the train/cv/devtrain MetaDatasets and the extern_data definition for
    LibriSpeech train-clean-100 with BPE-2000 labels.
    """
    bpe_size = 2000

    ogg_zip_dict = get_ogg_zip_dict("corpora")
    train_clean_100_ogg = ogg_zip_dict['train-clean-100']
    dev_clean_ogg = ogg_zip_dict['dev-clean']
    dev_other_ogg = ogg_zip_dict['dev-other']

    train_bpe_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False)
    # get_audio_datastream fetches the statistics ogg-zip itself, so only the
    # tool paths are passed here.
    audio_datastream = get_audio_datastream(
        returnn_python_exe=returnn_python_exe,
        returnn_root=returnn_root,
        output_path=output_path,
    )

    extern_data = {
        'audio_features': audio_datastream.as_returnn_data_opts(),
        'bpe_labels': train_bpe_datastream.as_returnn_data_opts(),
    }

    data_map = {
        "audio_features": ("zip_dataset", "data"),
        "bpe_labels": ("zip_dataset", "classes"),
    }

    train_zip_dataset = returnn_standalone.data.datasets.OggZipDataset(
        path=train_clean_100_ogg,
        audio_opts=audio_datastream.as_returnn_audio_opts(),
        target_opts=train_bpe_datastream.as_returnn_targets_opts(),
        partition_epoch=3,
        seq_ordering="laplace:.1000",
        other_opts={"epoch_wise_filter": {(1, 5): {"max_mean_len": 1000}}},  # still hardcoded, future work
    )
    train_dataset = returnn_standalone.data.datasets.MetaDataset(
        data_map=data_map,
        datasets={"zip_dataset": train_zip_dataset},
        seq_order_control_dataset="zip_dataset",
    )

    from i6_experiments.users.rossenbach.datasets.librispeech import get_mixed_cv_segments

    cv_zip_dataset = returnn_standalone.data.datasets.OggZipDataset(
        path=[dev_clean_ogg, dev_other_ogg],
        audio_opts=audio_datastream.as_returnn_audio_opts(),
        target_opts=train_bpe_datastream.as_returnn_targets_opts(),
        segment_file=get_mixed_cv_segments(),
        seq_ordering="sorted_reverse",
    )
    cv_dataset = returnn_standalone.data.datasets.MetaDataset(
        data_map=data_map,
        datasets={"zip_dataset": cv_zip_dataset},
        seq_order_control_dataset="zip_dataset",
    )

    devtrain_zip_dataset = returnn_standalone.data.datasets.OggZipDataset(
        path=train_clean_100_ogg,
        audio_opts=audio_datastream.as_returnn_audio_opts(),
        target_opts=train_bpe_datastream.as_returnn_targets_opts(),
        segment_file=get_mixed_cv_segments(),
        seq_ordering="sorted_reverse",
        subset=3000,
    )
    devtrain_dataset = returnn_standalone.data.datasets.MetaDataset(
        data_map=data_map,
        datasets={"zip_dataset": devtrain_zip_dataset},
        seq_order_control_dataset="zip_dataset",
    )

    return train_dataset, cv_dataset, devtrain_dataset, extern_data
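
# Example (sketch): wiring the returned datasets into a RETURNN config dict. This
# assumes the MetaDataset objects expose an as_returnn_opts() conversion helper
# (hypothetical here, not shown in this module) and uses the common
# "train" / "dev" / "eval_datasets" config keys.
def _example_returnn_config_dicts(returnn_python_exe, returnn_root, output_path):
    train_dataset, cv_dataset, devtrain_dataset, extern_data = build_training_datasets(
        returnn_python_exe=returnn_python_exe,
        returnn_root=returnn_root,
        output_path=output_path,
    )
    config = {
        "extern_data": extern_data,
        "train": train_dataset.as_returnn_opts(),  # assumed conversion helper
        "dev": cv_dataset.as_returnn_opts(),  # assumed conversion helper
        "eval_datasets": {"devtrain": devtrain_dataset.as_returnn_opts()},  # assumed
    }
    return config
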