def make_index(data_path): metadata_rel_path = os.path.join("meta", "esc50.csv") index = { "version": "2.0.0", "clips": {}, "metadata": { "esc50.csv": [ metadata_rel_path, md5(os.path.join(data_path, metadata_rel_path)), ] }, } audio_dir = os.path.join(data_path, "audio") wavfiles = glob.glob(os.path.join(audio_dir, "*.wav")) assert len(wavfiles) == 2000 for wf in wavfiles: clip_id = os.path.basename(wf).replace(".wav", "") index["clips"][clip_id] = { "audio": [ os.path.join("audio", os.path.basename(wf)), md5(wf), ] } with open(INDEX_PATH, "w") as fhandle: json.dump(index, fhandle, indent=2)
def make_index(data_path): metadata_rel_path = os.path.join("metadata", "UrbanSound8K.csv") index = { "version": "1.0", "clips": {}, "metadata": { "UrbanSound8K.csv": [ metadata_rel_path, md5(os.path.join(data_path, metadata_rel_path)), ] }, } for i in range(1, 11): fold_dir = os.path.join(data_path, "audio", "fold{}".format(i)) wavfiles = glob.glob(os.path.join(fold_dir, "*.wav")) for wf in wavfiles: clip_id = os.path.basename(wf).replace(".wav", "") index["clips"][clip_id] = { "audio": [ os.path.join("audio", "fold{}".format(i), os.path.basename(wf)), md5(wf), ] } with open(INDEX_PATH, "w") as fhandle: json.dump(index, fhandle, indent=2)
def make_index(data_path):
    index = {
        "version": "2.0",
        "clips": {},
        "metadata": {
            "Metadata-EigenScape": [
                "Metadata-EigenScape.csv",
                md5(os.path.join(data_path, "Metadata-EigenScape.csv")),
            ]
        },
    }
    # All audio files live flat in the dataset root
    clips = glob.glob(os.path.join(data_path, "*.wav"))

    # Store clips
    for clip in clips:
        clip_id = os.path.basename(clip).replace(".wav", "")
        index["clips"][clip_id] = {
            "audio": [
                os.path.basename(clip),
                md5(clip),
            ]
        }
    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)

def make_index(data_path):
    rel_paths = {
        'development': "TUT-sound-events-2017-development",
        'evaluation': "TUT-sound-events-2017-evaluation",
    }
    index = {
        "version": "2.0",
        "clips": {},
    }
    for dataset_type, relative_path in rel_paths.items():
        audio_path = os.path.join(data_path, relative_path, "audio/street")
        wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
        for wf in wavfiles:
            clip_id = os.path.basename(wf).replace(".wav", "")
            # Annotations mirror the audio tree: audio/street/x.wav -> meta/street/x.ann
            ann_file = wf.replace("audio", "meta").replace(".wav", ".ann")
            assert os.path.isfile(ann_file)
            # Only the development set ships non-verified annotations
            if dataset_type == 'development':
                non_verified_ann_file = ann_file.replace("meta", "non_verified/meta")
                assert os.path.isfile(non_verified_ann_file)
            else:
                non_verified_ann_file = None
            index["clips"][clip_id] = {
                "audio": [
                    os.path.join(relative_path, "audio/street", os.path.basename(wf)),
                    md5(wf),
                ],
                "annotations": [
                    os.path.join(relative_path, "meta/street", os.path.basename(ann_file)),
                    md5(ann_file),
                ],
                "non_verified_annotations": [
                    os.path.join(relative_path, "non_verified/meta/street",
                                 os.path.basename(ann_file)),
                    md5(non_verified_ann_file),
                ] if non_verified_ann_file is not None else [None, None],
            }
    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)

def make_dataset_index(dataset_data_path):
    audio_dir = os.path.join(dataset_data_path, "labelled")
    audio_files = sorted(glob.glob(os.path.join(audio_dir, "*", "*.flac")))
    label_dir = os.path.join(dataset_data_path, "labels_public")

    # Keep the last two path components (subfolder, filename) as the track id
    track_ids = [os.path.normpath(f).split(os.sep)[-2:] for f in audio_files]
    label_files = [tid[-1].replace(".flac", ".csv") for tid in track_ids]
    # One label file ships with a mismatched bracket in its name;
    # will be fixed in the next version
    label_files = [
        "[b827eb7d576e][2020-08-03T23-32-11Z][manual][---][565a40f866f3d2804332ca7896a4c77d][93.csv"
        if "565a40f866f3d2804332ca7896a4c77d" in lf else lf
        for lf in label_files
    ]
    audio_files = [os.sep.join(tid) for tid in track_ids]

    # top-key level metadata
    metadata_path = "labelled_metadata_public.csv"
    metadata_checksum = md5(os.path.join(dataset_data_path, metadata_path))
    index_metadata = {
        "metadata": {
            "spatiotemporal_metadata": (metadata_path, metadata_checksum),
        }
    }

    # top-key level tracks
    index_tracks = {}
    for i, tid in enumerate(tqdm(track_ids)):
        assert os.path.exists(os.path.join(audio_dir, audio_files[i]))
        assert os.path.exists(os.path.join(label_dir, label_files[i]))
        audio_checksum = md5(os.path.join(audio_dir, audio_files[i]))
        label_checksum = md5(os.path.join(label_dir, label_files[i]))
        index_tracks[tid[-1].replace(".flac", "")] = {
            "audio": (f"labelled/{audio_files[i]}", audio_checksum),
            "annotation": (f"labels_public/{label_files[i]}", label_checksum),
        }

    # top-key level version
    dataset_index = {"version": "1.0a"}

    # combine all in dataset index
    dataset_index.update(index_metadata)
    dataset_index.update({"clips": index_tracks})

    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)

def make_index(data_path): metadata_folder = "FSDnoisy18k.meta" audio_train_folder = "FSDnoisy18k.audio_train" audio_test_folder = "FSDnoisy18k.audio_test" index = { "version": "1.0", "clips": {}, "metadata": { # Groundtruth files "train": [ os.path.join(metadata_folder, "train.csv"), md5(os.path.join(data_path, metadata_folder, "train.csv")), ], "test": [ os.path.join(metadata_folder, "test.csv"), md5(os.path.join(data_path, metadata_folder, "test.csv")), ], }, } train_clips = glob.glob( os.path.join(data_path, audio_train_folder, "*.wav")) test_clips = glob.glob(os.path.join(data_path, audio_test_folder, "*.wav")) # Store train clips for clip in train_clips: clip_id = os.path.basename(clip).replace(".wav", "") index["clips"][clip_id] = { "audio": [ os.path.join(audio_train_folder, os.path.basename(clip)), md5(clip), ] } # Store test clips for clip in test_clips: clip_id = os.path.basename(clip).replace(".wav", "") index["clips"][clip_id] = { "audio": [ os.path.join(audio_test_folder, os.path.basename(clip)), md5(clip), ] } with open(INDEX_PATH, "w") as fhandle: json.dump(index, fhandle, indent=2)
def _index_event(index, data_path, formt, annotation_subset, subset):
    eval_long_filenames = _get_long_eval_filenames(data_path)
    annotation_path = os.path.join(data_path, annotation_subset)
    annotationfiles = glob.glob(os.path.join(annotation_path, "*.csv"))
    annotationfiles.sort()
    for af in annotationfiles:
        # Eval clips are indexed under their long filenames
        clip_id = '{}/{}'.format(
            formt + "_" + subset,
            eval_long_filenames[os.path.basename(af).replace(".csv", "")]
            if subset == "eval"
            else os.path.basename(af).replace(".csv", ""))
        index["clips"][clip_id]["events"] = [
            os.path.join(annotation_subset, os.path.basename(af)),
            md5(af),
        ]
    return index

def _index_wav(index, data_path, formt, subset, split, site):
    audio_path = os.path.join(
        data_path,
        "{}_{}".format(formt, subset),
        "{}-{}-{}".format(subset, split, site),
    )
    wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
    wavfiles.sort()
    for wf in wavfiles:
        clip_id = '{}_{}/{}-{}-{}/{}'.format(
            formt, subset, subset, split, site,
            os.path.basename(wf).replace(".wav", ""))
        index["clips"][clip_id] = {
            "audio": [
                os.path.join(
                    "{}_{}".format(formt, subset),
                    "{}-{}-{}".format(subset, split, site),
                    os.path.basename(wf)),
                md5(wf),
            ],
        }
    return index

def _index_wav(index, data_path, formt, subset):
    eval_long_filenames = _get_long_eval_filenames(data_path)
    if subset == "dev":
        audio_path = os.path.join(data_path, formt + "_" + subset)
    elif subset == "eval":
        # Eval audio is nested under the original distribution prefix
        audio_path = os.path.join(
            data_path, "proj/asignal/DCASE2019/dataset", formt + "_" + subset)
    wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
    wavfiles.sort()
    for wf in wavfiles:
        clip_id = '{}/{}'.format(
            formt + "_" + subset,
            eval_long_filenames[os.path.basename(wf).replace(".wav", "")]
            if subset == "eval"
            else os.path.basename(wf).replace(".wav", ""))
        index["clips"][clip_id] = {
            "audio": [
                # Keep the path relative to the dataset root
                # (assumes a fixed directory depth above it)
                os.path.join(*Path(wf).parts[5:]),
                md5(wf),
            ],
        }
    return index

def _index_wav(index, data_path, formt, subset):
    eval_long_filenames = _get_long_eval_filenames(data_path)
    audio_path = os.path.join(data_path, formt + "_" + subset)
    wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
    wavfiles.sort()
    for wf in wavfiles:
        clip_id = '{}/{}'.format(
            formt + "_" + subset,
            eval_long_filenames[os.path.basename(wf).replace(".wav", "")]
            if subset == "eval"
            else os.path.basename(wf).replace(".wav", ""))
        index["clips"][clip_id] = {
            "audio": [
                os.path.join(formt + "_" + subset, os.path.basename(wf)),
                md5(wf),
            ],
        }
    return index

def test_md5(mocker):
    audio_file = b"audio1234"
    expected_checksum = "6dc00d1bac757abe4ea83308dde68aab"

    mocker.patch("builtins.open", new=mocker.mock_open(read_data=audio_file))

    md5_checksum = validate.md5("test_file_path")
    assert expected_checksum == md5_checksum

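# The helper under test hashes file contents. A minimal sketch of what
# validate.md5 presumably looks like -- chunked hashing so large audio files
# are never read into memory at once. This is an illustration consistent with
# the test above, not the library source:
import hashlib

def md5(file_path):
    """Return the MD5 hex digest of the file at file_path."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as fhandle:
        # Read in 4 KiB chunks until EOF (read() returns b"")
        for chunk in iter(lambda: fhandle.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
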
def make_dataset_index(dataset_data_path):
    annotation_dir = os.path.join(dataset_data_path, "annotation")
    annotation_files = glob.glob(os.path.join(annotation_dir, "*.lab"))
    track_ids = sorted(
        [os.path.basename(f).split(".")[0] for f in annotation_files])

    # top-key level metadata
    metadata_checksum = md5(os.path.join(dataset_data_path, "id_mapping.txt"))
    index_metadata = {
        "metadata": {
            "id_mapping": ("id_mapping.txt", metadata_checksum)
        }
    }

    # top-key level tracks
    index_tracks = {}
    for track_id in track_ids:
        audio_checksum = md5(
            os.path.join(dataset_data_path, "Wavfile/{}.wav".format(track_id)))
        annotation_checksum = md5(
            os.path.join(dataset_data_path, "annotation/{}.lab".format(track_id)))
        index_tracks[track_id] = {
            "audio": ("Wavfile/{}.wav".format(track_id), audio_checksum),
            "annotation": ("annotation/{}.lab".format(track_id), annotation_checksum),
        }

    # top-key level version
    dataset_index = {"version": None}

    # combine all in dataset index
    dataset_index.update(index_metadata)
    dataset_index.update({"tracks": index_tracks})

    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)

def _index_event(index, data_path, formt, annotation_subset, subset, split, site):
    if subset == "dev":
        annotation_path = os.path.join(
            data_path, annotation_subset,
            "{}-{}-{}".format(subset, split, site))
    elif subset == "eval":
        annotation_path = os.path.join(data_path, annotation_subset)
    annotationfiles = glob.glob(os.path.join(annotation_path, "*.csv"))
    annotationfiles.sort()
    for af in annotationfiles:
        clip_id = '{}_{}/{}-{}-{}/{}'.format(
            formt, subset, subset, split, site,
            os.path.basename(af).replace(".csv", ""))
        if subset == "dev":
            index["clips"][clip_id]["events"] = [
                os.path.join(annotation_subset,
                             "{}-{}-{}".format(subset, split, site),
                             os.path.basename(af)),
                md5(af),
            ]
        elif subset == "eval":
            index["clips"][clip_id]["events"] = [
                os.path.join(annotation_subset, os.path.basename(af)),
                md5(af),
            ]
    return index

def download_from_remote(remote, save_dir, force_overwrite):
    """Download a remote dataset into path

    Fetch a dataset pointed to by remote's url, save it into path using
    remote's filename, and ensure its integrity based on the MD5 checksum
    of the downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Args:
        remote (RemoteFileMetadata): Named tuple containing remote dataset
            meta information: url, filename and checksum
        save_dir (str): Directory to save the file to. Usually `data_home`
        force_overwrite (bool): If True, overwrite existing file with the
            downloaded file. If False, does not overwrite, but checks that
            checksum is consistent.

    Returns:
        str: Full path of the created file.

    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    download_path = os.path.join(download_dir, remote.filename)

    if not os.path.exists(download_path) or force_overwrite:
        # if we got here, we want to overwrite any existing file
        if os.path.exists(download_path):
            os.remove(download_path)

        # If file doesn't exist or we want to overwrite, download it
        with DownloadProgressBar(
            unit="B", unit_scale=True, unit_divisor=1024, miniters=1
        ) as t:
            try:
                urllib.request.urlretrieve(
                    remote.url,
                    filename=download_path,
                    reporthook=t.update_to,
                    data=None,
                )
            except Exception as exc:
                error_msg = """
                soundata failed to download the dataset from {}!
                Please try again in a few minutes.
                If this error persists, please raise an issue at
                https://github.com/soundata/soundata,
                and tag it with 'broken-link'.
                """.format(remote.url)
                logging.error(error_msg)
                raise exc
    else:
        logging.info(
            "{} already exists and will not be downloaded. ".format(download_path)
            + "Rerun with force_overwrite=True to delete this file and force the download."
        )

    checksum = md5(download_path)
    if remote.checksum != checksum:
        raise IOError(
            "{} has an MD5 checksum ({}) "
            "differing from expected ({}), "
            "file may be corrupted.".format(download_path, checksum, remote.checksum)
        )
    return download_path

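# download_from_remote relies on a DownloadProgressBar whose update_to method
# serves as the urlretrieve reporthook. A minimal sketch following the pattern
# recommended in the tqdm documentation -- illustrative, assuming the helper
# is a thin tqdm subclass:
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    """tqdm progress bar adapted to urllib.request.urlretrieve's reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        # b: blocks transferred so far; bsize: block size in bytes;
        # tsize: total size in bytes, if the server reports it
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # advance the bar to b * bsize bytes
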
def make_index(data_path): rel_paths = { "development": "TAU-urban-acoustic-scenes-2020-mobile-development", "evaluation": "TAU-urban-acoustic-scenes-2020-mobile-evaluation", "leaderboard": "TAU-urban-acoustic-scenes-2020-mobile-leaderboard", } metadata_rel_path = os.path.join(rel_paths["development"], "meta.csv") setup_paths = {} for dataset_type in rel_paths.keys(): setup_paths[dataset_type] = os.path.join(rel_paths[dataset_type], "evaluation_setup") index = { "version": "2.0", "clips": {}, "metadata": { "meta.csv": [ metadata_rel_path, md5(os.path.join(data_path, metadata_rel_path)), ], "fold1_evaluate.csv": [ os.path.join(setup_paths["development"], "fold1_evaluate.csv"), md5( os.path.join(data_path, setup_paths["development"], "fold1_evaluate.csv")), ], "fold1_test.csv": [ os.path.join(setup_paths["development"], "fold1_test.csv"), md5( os.path.join(data_path, setup_paths["development"], "fold1_test.csv")), ], "fold1_train.csv": [ os.path.join(setup_paths["development"], "fold1_train.csv"), md5( os.path.join(data_path, setup_paths["development"], "fold1_train.csv")), ], "evaluation/fold1_test.csv": [ os.path.join(setup_paths["evaluation"], "fold1_test.csv"), md5( os.path.join(data_path, setup_paths["evaluation"], "fold1_test.csv")), ], }, } for relative_path in rel_paths.values(): audio_path = os.path.join(data_path, relative_path, "audio") wavfiles = glob.glob(os.path.join(audio_path, "*.wav")) for wf in wavfiles: clip_id = os.path.basename(wf).replace(".wav", "") index["clips"][clip_id] = { "audio": [ os.path.join(relative_path, "audio", os.path.basename(wf)), md5(wf), ] } with open(INDEX_PATH, "w") as fhandle: json.dump(index, fhandle, indent=2)
def make_index(data_path):
    rel_paths = {
        'impulse_response': [
            "3D-MARCo Impulse Responses/01_Speaker_+90deg_3m",
            "3D-MARCo Impulse Responses/02_Speaker_+75deg_4m",
            "3D-MARCo Impulse Responses/03_Speaker_+60deg_3m",
            "3D-MARCo Impulse Responses/04_Speaker_+45deg_4m",
            "3D-MARCo Impulse Responses/05_Speaker_+30deg_3m",
            "3D-MARCo Impulse Responses/06_Speaker_+15deg_4m",
            "3D-MARCo Impulse Responses/07_Speaker_0deg_3m",
            "3D-MARCo Impulse Responses/08_Speaker_-15deg_4m",
            "3D-MARCo Impulse Responses/09_Speaker_-30deg_3m",
            "3D-MARCo Impulse Responses/10_Speaker_-45deg_4m",
            "3D-MARCo Impulse Responses/11_Speaker_-60deg_3m",
            "3D-MARCo Impulse Responses/12_Speaker_-75deg_4m",
            "3D-MARCo Impulse Responses/13_Speaker_-90deg_3m",
        ],
        'acapella': ["Acappella"],
        'organ': ["Organ"],
        'piano_solo_1': ["Piano solo 1"],
        'piano_solo_2': ["Piano solo 2"],
        'quartet': ["Quartet"],
        'single_sources': [
            "Single sources at different positions/01_0deg",
            "Single sources at different positions/02_-15deg",
            "Single sources at different positions/03_-30deg",
            "Single sources at different positions/04_-45deg",
            "Single sources at different positions/05_-60deg",
            "Single sources at different positions/06_-75deg",
            "Single sources at different positions/07_-90deg",
        ],
        'trio': ["Trio"],
    }
    index = {
        "version": "1.0.1",
        "clips": {},
    }
    for source_type, paths in rel_paths.items():
        for path in paths:
            audio_path = os.path.join(data_path, path)
            wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
            for wf in wavfiles:
                clip_id = "{}/{}".format(
                    source_type, os.path.basename(wf).replace(".wav", ""))
                index["clips"][clip_id] = {
                    "audio": [os.path.join(path, os.path.basename(wf)), md5(wf)]
                }
    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)

def make_index(data_path): metadata_folder = "FSD50K.metadata" ground_truth_folder = "FSD50K.ground_truth" index = { "version": "1.0", "clips": {}, "metadata": { # Groundtruth files "dev_ground_truth": [ os.path.join(ground_truth_folder, "dev.csv"), md5(os.path.join(data_path, ground_truth_folder, "dev.csv")), ], "eval_ground_truth": [ os.path.join(ground_truth_folder, "eval.csv"), md5(os.path.join(data_path, ground_truth_folder, "eval.csv")), ], # List of FSD50K sound classes "vocabulary": [ os.path.join(ground_truth_folder, "vocabulary.csv"), md5( os.path.join(data_path, ground_truth_folder, "vocabulary.csv")), ], # Additional metadata "dev_clips_info": [ os.path.join(metadata_folder, "dev_clips_info_FSD50K.json"), md5( os.path.join(data_path, metadata_folder, "dev_clips_info_FSD50K.json")), ], "eval_clips_info": [ os.path.join(metadata_folder, "eval_clips_info_FSD50K.json"), md5( os.path.join(data_path, metadata_folder, "eval_clips_info_FSD50K.json")), ], # Relevant info about the labels "class_info": [ os.path.join(metadata_folder, "class_info_FSD50K.json"), md5( os.path.join(data_path, metadata_folder, "class_info_FSD50K.json")), ], # PP/PNP ratings "pp_pnp_ratings": [ os.path.join(metadata_folder, "pp_pnp_ratings_FSD50K.json"), md5( os.path.join(data_path, metadata_folder, "pp_pnp_ratings_FSD50K.json")), ], }, } # Development audio folder dev_audio_dir = "FSD50K.dev_audio" eval_audio_dir = "FSD50K.eval_audio" dev_clips = glob.glob(os.path.join(data_path, dev_audio_dir, "*.wav")) eval_clips = glob.glob(os.path.join(data_path, eval_audio_dir, "*.wav")) # Store development clips for clip in dev_clips: clip_id = os.path.basename(clip).replace(".wav", "") index["clips"][clip_id] = { "audio": [ os.path.join(dev_audio_dir, os.path.basename(clip)), md5(clip), ] } # Store evaluation clips for clip in eval_clips: clip_id = os.path.basename(clip).replace(".wav", "") index["clips"][clip_id] = { "audio": [ os.path.join(eval_audio_dir, os.path.basename(clip)), md5(clip), ] } with open(INDEX_PATH, "w") as fhandle: json.dump(index, fhandle, indent=2)