Example #1
def make_index(data_path):

    metadata_rel_path = os.path.join("meta", "esc50.csv")

    index = {
        "version": "2.0.0",
        "clips": {},
        "metadata": {
            "esc50.csv": [
                metadata_rel_path,
                md5(os.path.join(data_path, metadata_rel_path)),
            ]
        },
    }

    audio_dir = os.path.join(data_path, "audio")
    wavfiles = glob.glob(os.path.join(audio_dir, "*.wav"))
    assert len(wavfiles) == 2000

    for wf in wavfiles:

        clip_id = os.path.basename(wf).replace(".wav", "")
        index["clips"][clip_id] = {
            "audio": [
                os.path.join("audio", os.path.basename(wf)),
                md5(wf),
            ]
        }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
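A note on scaffolding: all of the generators in these examples assume a few module-level pieces that the snippets omit: imports of os, glob and json, an md5(path) checksum helper, and an INDEX_PATH (or DATASET_INDEX_PATH) constant pointing at the output JSON file; Example #5 additionally uses tqdm and Example #9 uses pathlib.Path. A minimal sketch of that scaffolding, with the output path and the non-chunked md5 implementation chosen only for illustration, is:

import glob
import hashlib
import json
import os

# Hypothetical output location; each real script defines its own INDEX_PATH
# (or DATASET_INDEX_PATH) alongside the dataset loader it belongs to.
INDEX_PATH = "indexes/example_index.json"


def md5(file_path):
    # Return the MD5 hex digest of a file's contents. The helper shipped with
    # soundata may read the file in chunks; hashing it in one go is enough here.
    with open(file_path, "rb") as fhandle:
        return hashlib.md5(fhandle.read()).hexdigest()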
Example #2
def make_index(data_path):

    metadata_rel_path = os.path.join("metadata", "UrbanSound8K.csv")

    index = {
        "version": "1.0",
        "clips": {},
        "metadata": {
            "UrbanSound8K.csv": [
                metadata_rel_path,
                md5(os.path.join(data_path, metadata_rel_path)),
            ]
        },
    }

    for i in range(1, 11):

        fold_dir = os.path.join(data_path, "audio", "fold{}".format(i))

        wavfiles = glob.glob(os.path.join(fold_dir, "*.wav"))

        for wf in wavfiles:

            clip_id = os.path.basename(wf).replace(".wav", "")
            index["clips"][clip_id] = {
                "audio": [
                    os.path.join("audio", "fold{}".format(i),
                                 os.path.basename(wf)),
                    md5(wf),
                ]
            }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #3
def make_index(data_path):

    index = {
        "version": "2.0",
        "clips": {},
        "metadata": {
            "Metadata-EigenScape": [
                "Metadata-EigenScape.csv",
                md5(os.path.join(data_path, "Metadata-EigenScape.csv")),
            ]
        }
    }

    # audio folder
    clips = glob.glob(os.path.join(data_path, "*.wav"))

    # Store clips
    for clip in clips:
        clip_id = os.path.basename(clip).replace(".wav", "")
        index["clips"][clip_id] = {
            "audio": [
                os.path.basename(clip),
                md5(clip),
            ]
        }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #4
def make_index(data_path):

    rel_paths = {
        'development': "TUT-sound-events-2017-development",
        'evaluation': "TUT-sound-events-2017-evaluation"
    }

    index = {
        "version": "2.0",
        "clips": {},
    }

    for dataset_type, relative_path in rel_paths.items():

        audio_path = os.path.join(data_path, relative_path, "audio/street")

        wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))

        for wf in wavfiles:

            clip_id = os.path.basename(wf).replace(".wav", "")

            ann_file = wf.replace("audio", "meta").replace(".wav", ".ann")

            assert os.path.isfile(ann_file)

            if dataset_type == 'development':
                non_verified_ann_file = ann_file.replace(
                    "meta", "non_verified/meta")
                assert os.path.isfile(non_verified_ann_file)
            else:
                non_verified_ann_file = None

            index["clips"][clip_id] = {
                "audio": [
                    os.path.join(
                        relative_path, "audio/street",
                        os.path.basename(wf)),
                    md5(wf),
                ],
                "annotations": [
                    os.path.join(
                        relative_path, "meta/street",
                        os.path.basename(ann_file)),
                    md5(ann_file),
                ],
                "non_verified_annotations": [
                    os.path.join(
                        relative_path, "non_verified/meta/street",
                        os.path.basename(ann_file)),
                    md5(non_verified_ann_file),
                ] if non_verified_ann_file is not None else [None, None]
            }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #5
def make_dataset_index(dataset_data_path):
    audio_dir = os.path.join(dataset_data_path, "labelled")
    audio_files = sorted(glob.glob(os.path.join(audio_dir, "*", "*.flac")))

    label_dir = os.path.join(dataset_data_path, "labels_public")

    track_ids = [os.path.normpath(f).split(os.sep)[-2:] for f in audio_files]
    label_files = [tid[-1].replace(".flac", ".csv") for tid in track_ids]
    # Workaround for one mangled label filename; will be fixed in the next version
    label_files = [
        "[b827eb7d576e][2020-08-03T23-32-11Z][manual][---][565a40f866f3d2804332ca7896a4c77d][93.csv"
        if "565a40f866f3d2804332ca7896a4c77d" in lf
        else lf
        for lf in label_files
    ]
    audio_files = [os.sep.join(tid) for tid in track_ids]

    # top-key level metadata

    metadata_path = "labelled_metadata_public.csv"
    metadata_checksum = md5(os.path.join(dataset_data_path, metadata_path))

    index_metadata = {
        "metadata": {
            "spatiotemporal_metadata": (metadata_path, metadata_checksum),
        }
    }

    # top-key level tracks
    index_tracks = {}
    for i, tid in enumerate(tqdm(track_ids)):

        assert os.path.exists(os.path.join(audio_dir, audio_files[i]))
        assert os.path.exists(os.path.join(label_dir, label_files[i]))

        audio_checksum = md5(os.path.join(audio_dir, audio_files[i]))

        label_checksum = md5(os.path.join(label_dir, label_files[i]))

        index_tracks[tid[-1].replace(".flac", "")] = {
            "audio": (f"labelled/{audio_files[i]}", audio_checksum),
            "annotation": (f"labels_public/{label_files[i]}", label_checksum),
        }

    # top-key level version
    dataset_index = {"version": "1.0a"}

    # combine all in dataset index
    dataset_index.update(index_metadata)
    dataset_index.update({"clips": index_tracks})

    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)
Example #6
def make_index(data_path):

    metadata_folder = "FSDnoisy18k.meta"
    audio_train_folder = "FSDnoisy18k.audio_train"
    audio_test_folder = "FSDnoisy18k.audio_test"

    index = {
        "version": "1.0",
        "clips": {},
        "metadata": {
            # Groundtruth files
            "train": [
                os.path.join(metadata_folder, "train.csv"),
                md5(os.path.join(data_path, metadata_folder, "train.csv")),
            ],
            "test": [
                os.path.join(metadata_folder, "test.csv"),
                md5(os.path.join(data_path, metadata_folder, "test.csv")),
            ],
        },
    }

    train_clips = glob.glob(
        os.path.join(data_path, audio_train_folder, "*.wav"))
    test_clips = glob.glob(os.path.join(data_path, audio_test_folder, "*.wav"))

    # Store train clips
    for clip in train_clips:
        clip_id = os.path.basename(clip).replace(".wav", "")
        index["clips"][clip_id] = {
            "audio": [
                os.path.join(audio_train_folder, os.path.basename(clip)),
                md5(clip),
            ]
        }

    # Store test clips
    for clip in test_clips:
        clip_id = os.path.basename(clip).replace(".wav", "")
        index["clips"][clip_id] = {
            "audio": [
                os.path.join(audio_test_folder, os.path.basename(clip)),
                md5(clip),
            ]
        }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #7
def _index_event(index, data_path, formt, annotation_subset, subset):

    eval_long_filenames = _get_long_eval_filenames(data_path)

    annotation_path = os.path.join(
        data_path,
        annotation_subset,
    )

    annotationfiles = glob.glob(os.path.join(annotation_path, "*.csv"))
    annotationfiles.sort()

    for af in annotationfiles:

        clip_id = '{}/{}'.format(
            formt + "_" + subset,
            eval_long_filenames[os.path.basename(af).replace(".csv", "")]
            if subset == "eval" else os.path.basename(af).replace(".csv", ""))

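        # Assumes index["clips"][clip_id] already exists (e.g. created by a
        # prior _index_wav pass), so only the "events" entry is attached here.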
        index["clips"][clip_id]["events"] = [
            os.path.join(annotation_subset, os.path.basename(af)),
            md5(af),
        ]

    return index
Example #8
def _index_wav(index, data_path, formt, subset, split, site):

    audio_path = os.path.join(
        data_path,
        "{}_{}".format(formt, subset),
        "{}-{}-{}".format(subset, split, site),
    )

    wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
    wavfiles.sort()

    for wf in wavfiles:

        clip_id = '{}_{}/{}-{}-{}/{}'.format(
            formt, subset, subset, split, site,
            os.path.basename(wf).replace(".wav", ""))

        index["clips"][clip_id] = {
            "audio": [
                os.path.join("{}_{}".format(formt, subset),
                             "{}-{}-{}".format(subset, split, site),
                             os.path.basename(wf)),
                md5(wf),
            ],
        }

    return index
Example #9
def _index_wav(index, data_path, formt, subset):

    eval_long_filenames = _get_long_eval_filenames(data_path)

    if subset == "dev":
        audio_path = os.path.join(
            data_path,
            formt + "_" + subset,
        )
    elif subset == "eval":
        audio_path = os.path.join(
            data_path,
            "proj/asignal/DCASE2019/dataset",
            formt + "_" + subset,
        )

    wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
    wavfiles.sort()

    for wf in wavfiles:

        clip_id = '{}/{}'.format(
            formt + "_" + subset,
            eval_long_filenames[os.path.basename(wf).replace(".wav", "")]
            if subset == "eval" else os.path.basename(wf).replace(".wav", ""))

        index["clips"][clip_id] = {
            "audio": [
                os.path.join(*Path(wf).parts[5:]),
                md5(wf),
            ],
        }

    return index
Example #10
def _index_wav(index, data_path, formt, subset):

    eval_long_filenames = _get_long_eval_filenames(data_path)

    audio_path = os.path.join(
        data_path,
        formt + "_" + subset,
    )

    wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))
    wavfiles.sort()

    for wf in wavfiles:

        clip_id = '{}/{}'.format(
            formt + "_" + subset,
            eval_long_filenames[os.path.basename(wf).replace(".wav", "")]
            if subset == "eval" else os.path.basename(wf).replace(".wav", ""))

        index["clips"][clip_id] = {
            "audio": [
                os.path.join(formt + "_" + subset, os.path.basename(wf)),
                md5(wf),
            ],
        }

    return index
Example #11
def test_md5(mocker):
    audio_file = b"audio1234"

    expected_checksum = "6dc00d1bac757abe4ea83308dde68aab"

    mocker.patch("builtins.open", new=mocker.mock_open(read_data=audio_file))

    md5_checksum = validate.md5("test_file_path")
    assert expected_checksum == md5_checksum
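Here the mocker fixture comes from the pytest-mock plugin: builtins.open is patched with a mock_open whose read data is the in-memory bytes b"audio1234", so validate.md5 never touches the filesystem. Assuming the helper simply hashes the file contents (as in the sketch after Example #1), the expected string is just the digest of those bytes:

import hashlib

# MD5 hex digest of the bytes the mocked file "contains".
hashlib.md5(b"audio1234").hexdigest()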
Example #12
def make_dataset_index(dataset_data_path):
    annotation_dir = os.path.join(dataset_data_path, "annotation")
    annotation_files = glob.glob(os.path.join(annotation_dir, "*.lab"))
    track_ids = sorted(
        [os.path.basename(f).split(".")[0] for f in annotation_files])

    # top-key level metadata
    metadata_checksum = md5(os.path.join(dataset_data_path, "id_mapping.txt"))
    index_metadata = {
        "metadata": {
            "id_mapping": ("id_mapping.txt", metadata_checksum)
        }
    }

    # top-key level tracks
    index_tracks = {}
    for track_id in track_ids:
        audio_checksum = md5(
            os.path.join(dataset_data_path, "Wavfile/{}.wav".format(track_id)))
        annotation_checksum = md5(
            os.path.join(dataset_data_path,
                         "annotation/{}.lab".format(track_id)))

        index_tracks[track_id] = {
            "audio": ("Wavfile/{}.wav".format(track_id), audio_checksum),
            "annotation":
            ("annotation/{}.lab".format(track_id), annotation_checksum),
        }

    # top-key level version
    dataset_index = {"version": None}

    # combine all in dataset index
    dataset_index.update(index_metadata)
    dataset_index.update({"tracks": index_tracks})

    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)
Example #13
def _index_event(index, data_path, formt, annotation_subset, subset, split,
                 site):

    if subset == "dev":
        annotation_path = os.path.join(data_path, annotation_subset,
                                       "{}-{}-{}".format(subset, split, site))
    elif subset == "eval":
        annotation_path = os.path.join(
            data_path,
            annotation_subset,
        )

    annotationfiles = glob.glob(os.path.join(annotation_path, "*.csv"))
    annotationfiles.sort()

    for af in annotationfiles:

        clip_id = '{}_{}/{}-{}-{}/{}'.format(
            formt, subset, subset, split, site,
            os.path.basename(af).replace(".csv", ""))
        if subset == "dev":

            index["clips"][clip_id]["events"] = [
                os.path.join(annotation_subset,
                             "{}-{}-{}".format(subset, split, site),
                             os.path.basename(af)),
                md5(af),
            ]
        elif subset == "eval":

            index["clips"][clip_id]["events"] = [
                os.path.join(annotation_subset, os.path.basename(af)),
                md5(af),
            ]

    return index
Example #14
def download_from_remote(remote, save_dir, force_overwrite):
    """Download a remote dataset into path
    Fetch a dataset pointed by remote's url, save into path using remote's
    filename and ensure its integrity based on the MD5 Checksum of the
    downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Args:
        remote (RemoteFileMetadata): Named tuple containing remote dataset
            meta information: url, filename and checksum
        save_dir (str): Directory to save the file to. Usually `data_home`
        force_overwrite  (bool):
            If True, overwrite existing file with the downloaded file.
            If False, does not overwrite, but checks that checksum is consistent.

    Returns:
        str: Full path of the created file.

    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    download_path = os.path.join(download_dir, remote.filename)

    if not os.path.exists(download_path) or force_overwrite:
        # if we got here, we want to overwrite any existing file
        if os.path.exists(download_path):
            os.remove(download_path)

        # If file doesn't exist or we want to overwrite, download it
        with DownloadProgressBar(unit="B",
                                 unit_scale=True,
                                 unit_divisor=1024,
                                 miniters=1) as t:
            try:
                urllib.request.urlretrieve(
                    remote.url,
                    filename=download_path,
                    reporthook=t.update_to,
                    data=None,
                )
            except Exception as exc:
                error_msg = """
                            soundata failed to download the dataset from {}!
                            Please try again in a few minutes.
                            If this error persists, please raise an issue at
                            https://github.com/soundata/soundata,
                            and tag it with 'broken-link'.
                            """.format(remote.url)
                logging.error(error_msg)
                raise exc
    else:
        logging.info(
            "{} already exists and will not be downloaded. ".format(
                download_path) +
            "Rerun with force_overwrite=True to delete this file and force the download."
        )

    checksum = md5(download_path)
    if remote.checksum != checksum:

        raise IOError("{} has an MD5 checksum ({}) "
                      "differing from expected ({}), "
                      "file may be corrupted.".format(download_path, checksum,
                                                      remote.checksum))
    return download_path
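For context, a hedged usage sketch of download_from_remote: the remote argument is a named tuple whose fields (url, filename, checksum, destination_dir) are inferred from the attribute accesses in the function body; the URL, checksum and save directory below are placeholders, not real values.

from collections import namedtuple

# Field names inferred from how `remote` is used above; soundata's own
# RemoteFileMetadata class may differ in detail.
RemoteFileMetadata = namedtuple(
    "RemoteFileMetadata", ["url", "filename", "checksum", "destination_dir"]
)

remote = RemoteFileMetadata(
    url="https://example.com/dataset.zip",        # placeholder URL
    filename="dataset.zip",
    checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder MD5
    destination_dir=None,
)

# With real values, this downloads the archive (or reuses an existing copy),
# verifies its checksum, and returns the local file path.
archive_path = download_from_remote(remote, save_dir="/tmp/data", force_overwrite=False)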
Example #15
def make_index(data_path):

    rel_paths = {
        "development": "TAU-urban-acoustic-scenes-2020-mobile-development",
        "evaluation": "TAU-urban-acoustic-scenes-2020-mobile-evaluation",
        "leaderboard": "TAU-urban-acoustic-scenes-2020-mobile-leaderboard",
    }

    metadata_rel_path = os.path.join(rel_paths["development"], "meta.csv")

    setup_paths = {}
    for dataset_type in rel_paths.keys():
        setup_paths[dataset_type] = os.path.join(rel_paths[dataset_type],
                                                 "evaluation_setup")

    index = {
        "version": "2.0",
        "clips": {},
        "metadata": {
            "meta.csv": [
                metadata_rel_path,
                md5(os.path.join(data_path, metadata_rel_path)),
            ],
            "fold1_evaluate.csv": [
                os.path.join(setup_paths["development"], "fold1_evaluate.csv"),
                md5(
                    os.path.join(data_path, setup_paths["development"],
                                 "fold1_evaluate.csv")),
            ],
            "fold1_test.csv": [
                os.path.join(setup_paths["development"], "fold1_test.csv"),
                md5(
                    os.path.join(data_path, setup_paths["development"],
                                 "fold1_test.csv")),
            ],
            "fold1_train.csv": [
                os.path.join(setup_paths["development"], "fold1_train.csv"),
                md5(
                    os.path.join(data_path, setup_paths["development"],
                                 "fold1_train.csv")),
            ],
            "evaluation/fold1_test.csv": [
                os.path.join(setup_paths["evaluation"], "fold1_test.csv"),
                md5(
                    os.path.join(data_path, setup_paths["evaluation"],
                                 "fold1_test.csv")),
            ],
        },
    }

    for relative_path in rel_paths.values():

        audio_path = os.path.join(data_path, relative_path, "audio")

        wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))

        for wf in wavfiles:

            clip_id = os.path.basename(wf).replace(".wav", "")

            index["clips"][clip_id] = {
                "audio": [
                    os.path.join(relative_path, "audio", os.path.basename(wf)),
                    md5(wf),
                ]
            }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #16
def make_index(data_path):

    rel_paths = {
        'impulse_response': [
            "3D-MARCo Impulse Responses/01_Speaker_+90deg_3m",
            "3D-MARCo Impulse Responses/02_Speaker_+75deg_4m",
            "3D-MARCo Impulse Responses/03_Speaker_+60deg_3m",
            "3D-MARCo Impulse Responses/04_Speaker_+45deg_4m",
            "3D-MARCo Impulse Responses/05_Speaker_+30deg_3m",
            "3D-MARCo Impulse Responses/06_Speaker_+15deg_4m",
            "3D-MARCo Impulse Responses/07_Speaker_0deg_3m",
            "3D-MARCo Impulse Responses/08_Speaker_-15deg_4m",
            "3D-MARCo Impulse Responses/09_Speaker_-30deg_3m",
            "3D-MARCo Impulse Responses/10_Speaker_-45deg_4m",
            "3D-MARCo Impulse Responses/11_Speaker_-60deg_3m",
            "3D-MARCo Impulse Responses/12_Speaker_-75deg_4m",
            "3D-MARCo Impulse Responses/13_Speaker_-90deg_3m",
        ],
        'acapella': ["Acappella"],
        'organ': ["Organ"],
        'piano_solo_1': ["Piano solo 1"],
        'piano_solo_2': ["Piano solo 2"],
        'quartet': ["Quartet"],
        'single_sources': [
            "Single sources at different positions/01_0deg",
            "Single sources at different positions/02_-15deg",
            "Single sources at different positions/03_-30deg",
            "Single sources at different positions/04_-45deg",
            "Single sources at different positions/05_-60deg",
            "Single sources at different positions/06_-75deg",
            "Single sources at different positions/07_-90deg",
        ],
        'trio': ["Trio"],
    }

    index = {
        "version": "1.0.1",
        "clips": {},
    }

    for source_type, paths in rel_paths.items():

        for path in paths:

            audio_path = os.path.join(data_path, path)
            wavfiles = glob.glob(os.path.join(audio_path, "*.wav"))

            for wf in wavfiles:

                clip_id = "{}/{}".format(
                    source_type,
                    os.path.basename(wf).replace(".wav", ""))

                index["clips"][clip_id] = {
                    "audio":
                    [os.path.join(path, os.path.basename(wf)),
                     md5(wf)]
                }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #17
def make_index(data_path):

    metadata_folder = "FSD50K.metadata"
    ground_truth_folder = "FSD50K.ground_truth"

    index = {
        "version": "1.0",
        "clips": {},
        "metadata": {
            # Groundtruth files
            "dev_ground_truth": [
                os.path.join(ground_truth_folder, "dev.csv"),
                md5(os.path.join(data_path, ground_truth_folder, "dev.csv")),
            ],
            "eval_ground_truth": [
                os.path.join(ground_truth_folder, "eval.csv"),
                md5(os.path.join(data_path, ground_truth_folder, "eval.csv")),
            ],
            # List of FSD50K sound classes
            "vocabulary": [
                os.path.join(ground_truth_folder, "vocabulary.csv"),
                md5(
                    os.path.join(data_path, ground_truth_folder,
                                 "vocabulary.csv")),
            ],
            # Additional metadata
            "dev_clips_info": [
                os.path.join(metadata_folder, "dev_clips_info_FSD50K.json"),
                md5(
                    os.path.join(data_path, metadata_folder,
                                 "dev_clips_info_FSD50K.json")),
            ],
            "eval_clips_info": [
                os.path.join(metadata_folder, "eval_clips_info_FSD50K.json"),
                md5(
                    os.path.join(data_path, metadata_folder,
                                 "eval_clips_info_FSD50K.json")),
            ],
            # Relevant info about the labels
            "class_info": [
                os.path.join(metadata_folder, "class_info_FSD50K.json"),
                md5(
                    os.path.join(data_path, metadata_folder,
                                 "class_info_FSD50K.json")),
            ],
            # PP/PNP ratings
            "pp_pnp_ratings": [
                os.path.join(metadata_folder, "pp_pnp_ratings_FSD50K.json"),
                md5(
                    os.path.join(data_path, metadata_folder,
                                 "pp_pnp_ratings_FSD50K.json")),
            ],
        },
    }

    # Development audio folder
    dev_audio_dir = "FSD50K.dev_audio"
    eval_audio_dir = "FSD50K.eval_audio"
    dev_clips = glob.glob(os.path.join(data_path, dev_audio_dir, "*.wav"))
    eval_clips = glob.glob(os.path.join(data_path, eval_audio_dir, "*.wav"))

    # Store development clips
    for clip in dev_clips:
        clip_id = os.path.basename(clip).replace(".wav", "")
        index["clips"][clip_id] = {
            "audio": [
                os.path.join(dev_audio_dir, os.path.basename(clip)),
                md5(clip),
            ]
        }

    # Store evaluation clips
    for clip in eval_clips:
        clip_id = os.path.basename(clip).replace(".wav", "")
        index["clips"][clip_id] = {
            "audio": [
                os.path.join(eval_audio_dir, os.path.basename(clip)),
                md5(clip),
            ]
        }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
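Across all of these generators the JSON written to INDEX_PATH (or DATASET_INDEX_PATH) has the same overall shape (Example #12 uses a "tracks" key instead of "clips"); an illustration with invented clip id, paths and checksums:

# Illustration only; every value below is made up.
{
    "version": "1.0",
    "metadata": {
        "some_metadata.csv": ["meta/some_metadata.csv", "<md5 of the csv>"]
    },
    "clips": {
        "some_clip_id": {
            "audio": ["audio/some_clip_id.wav", "<md5 of the wav>"]
        }
    }
}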