Example #1
def download_from_remote(remote, save_dir, force_overwrite=False):
    """Download a remote dataset into path
    Fetch a dataset pointed by remote's url, save into path using remote's
    filename and ensure its integrity based on the MD5 Checksum of the
    downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Args:
        remote (RemoteFileMetadata): Named tuple containing remote dataset
            meta information: url, filename and checksum
        save_dir (str): Directory to save the file to. Usually `data_home`
        force_overwrite (bool): If True, overwrite the existing file with the
            downloaded file. If False, do not overwrite, but check that the
            checksum is consistent.

    Returns:
        file_path (str): Full path of the created file.
    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    download_path = os.path.join(download_dir, remote.filename)
    if not os.path.exists(download_path) or force_overwrite:
        # If file doesn't exist or we want to overwrite, download it
        with DownloadProgressBar(
            unit='B', unit_scale=True, unit_divisor=1024, miniters=1
        ) as t:
            try:
                urllib.request.urlretrieve(
                    remote.url,
                    filename=download_path,
                    reporthook=t.update_to,
                    data=None,
                )
            except Exception as e:
                error_msg = """
                            mirdata failed to download the dataset!
                            Please try again in a few minutes.
                            If this error persists, please raise an issue at
                            https://github.com/mir-dataset-loaders/mirdata,
                            and tag it with 'broken-link'.
                            """
                print(error_msg)
                raise e

    checksum = md5(download_path)
    if remote.checksum != checksum:
        raise IOError(
            '{} has an MD5 checksum ({}) '
            'differing from expected ({}), '
            'file may be corrupted.'.format(download_path, checksum, remote.checksum)
        )
    return download_path
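
A minimal usage sketch (not from the source): RemoteFileMetadata is assumed here to be a namedtuple with the fields the function reads (filename, url, checksum, destination_dir), and the file name, URL, and checksum are placeholders.

from collections import namedtuple

# Hypothetical namedtuple mirroring the fields download_from_remote accesses.
RemoteFileMetadata = namedtuple(
    "RemoteFileMetadata", ["filename", "url", "checksum", "destination_dir"]
)

REMOTE = RemoteFileMetadata(
    filename="dataset.zip",                       # placeholder file name
    url="https://example.com/dataset.zip",        # placeholder URL
    checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder MD5
    destination_dir=None,
)

# Downloads into save_dir and verifies the MD5 checksum of the result.
# file_path = download_from_remote(REMOTE, save_dir="./data", force_overwrite=False)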
Example #2
def download_from_remote(remote, save_dir, force_overwrite=False):
    """Download a remote dataset into path
    Fetch a dataset pointed by remote's url, save into path using remote's
    filename and ensure its integrity based on the MD5 Checksum of the
    downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Parameters
    -----------
    remote: RemoteFileMetadata
        Named tuple containing remote dataset meta information: url, filename
        and checksum
    save_dir: string
        Directory to save the file to. Usually `data_home`
    force_overwrite: bool
        If True, overwrite the existing file with the downloaded file.
        If False, do not overwrite, but check that the checksum is consistent.

    Returns
    -------
    file_path: string
        Full path of the created file.
    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    download_path = os.path.join(download_dir, remote.filename)
    if not os.path.exists(download_path) or force_overwrite:
        # If file doesn't exist or we want to overwrite, download it
        with DownloadProgressBar(
            unit='B', unit_scale=True, miniters=1, desc=remote.url.split('/')[-1]
        ) as t:
            try:
                _download_large_file(remote.url, download_path, t.update_to)
            except HTTPError:
                error_msg = """
                            mirdata failed to download the dataset!
                            Please try again in a few minutes.
                            If this error persists, please raise an issue at
                            https://github.com/mir-dataset-loaders/mirdata,
                            and tag it with 'broken-link'.
                            """
                raise HTTPError(error_msg)

    checksum = md5(download_path)
    if remote.checksum != checksum:
        raise IOError(
            '{} has an MD5 checksum ({}) '
            'differing from expected ({}), '
            'file may be corrupted.'.format(download_path, checksum, remote.checksum)
        )
    return download_path
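
The helper _download_large_file is referenced above but not shown on this page. A minimal sketch of a streaming download with the same call shape, assuming the requests library and a tqdm-style callback taking (block_num, block_size, total_size); this is illustrative, not mirdata's actual implementation.

import requests

def _download_large_file_sketch(url, download_path, callback=None, chunk_size=8192):
    # Stream the response to disk in chunks so large files never sit fully in memory.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total_size = int(response.headers.get("content-length", 0))
    with open(download_path, "wb") as fhandle:
        for block_num, chunk in enumerate(response.iter_content(chunk_size=chunk_size), start=1):
            fhandle.write(chunk)
            if callback is not None:
                # Same argument order as urlretrieve's reporthook.
                callback(block_num, chunk_size, total_size)
    return download_path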
Example #3
File: test_utils.py  Project: MTG/mirdata
def test_md5(mocker):
    audio_file = b"audio1234"

    expected_checksum = "6dc00d1bac757abe4ea83308dde68aab"

    mocker.patch("%s.open" % builtin_module_name,
                 new=mocker.mock_open(read_data=audio_file))

    md5_checksum = utils.md5("test_file_path")
    assert expected_checksum == md5_checksum
Example #4
def test_md5(mocker):
    audio_file = b'audio1234'

    expected_checksum = '6dc00d1bac757abe4ea83308dde68aab'

    mocker.patch('%s.open' % builtin_module_name,
                 new=mocker.mock_open(read_data=audio_file))

    md5_checksum = utils.md5('test_file_path')
    assert expected_checksum == md5_checksum
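
Both tests above mock builtins.open, so they exercise a utils.md5 helper that reads a file and hashes its bytes. A minimal sketch of such a helper, assuming chunked reads; the real mirdata implementation may differ in details.

import hashlib

def md5_sketch(file_path):
    # Hash the file in fixed-size chunks so arbitrarily large files can be checksummed.
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as fhandle:
        for chunk in iter(lambda: fhandle.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()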
Example #5
def make_ikala_index(ikala_data_path):
    lyrics_dir = os.path.join(ikala_data_path, 'Lyrics')
    lyrics_files = glob.glob(os.path.join(lyrics_dir, '*.lab'))
    track_ids = sorted(
        [os.path.basename(f).split('.')[0] for f in lyrics_files])

    # top-key level metadata
    metadata_checksum = md5(os.path.join(ikala_data_path, 'id_mapping.txt'))
    index_metadata = {
        "metadata": {
            "id_mapping": ("id_mapping.txt", metadata_checksum)
        }
    }

    # top-key level tracks
    index_tracks = {}
    for track_id in track_ids:
        audio_checksum = md5(
            os.path.join(ikala_data_path, "Wavfile/{}.wav".format(track_id)))
        pitch_checksum = md5(
            os.path.join(ikala_data_path, "PitchLabel/{}.pv".format(track_id)))
        lyrics_checksum = md5(
            os.path.join(ikala_data_path, "Lyrics/{}.lab".format(track_id)))

        index_tracks[track_id] = {
            "audio": ("Wavfile/{}.wav".format(track_id), audio_checksum),
            "pitch": ("PitchLabel/{}.pv".format(track_id), pitch_checksum),
            "lyrics": ("Lyrics/{}.lab".format(track_id), lyrics_checksum),
        }

    # top-key level version
    ikala_index = {"version": None}

    # combine all in dataset index
    ikala_index.update(index_metadata)
    ikala_index.update({"tracks": index_tracks})

    with open(IKALA_INDEX_PATH, 'w') as fhandle:
        json.dump(ikala_index, fhandle, indent=2)
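
For orientation, the index written above has three top-level keys. A sketch of its shape for a single track, with placeholder track id and checksums:

ikala_index_shape = {
    "version": None,
    "metadata": {"id_mapping": ["id_mapping.txt", "<md5 checksum>"]},
    "tracks": {
        "<track_id>": {
            "audio": ["Wavfile/<track_id>.wav", "<md5 checksum>"],
            "pitch": ["PitchLabel/<track_id>.pv", "<md5 checksum>"],
            "lyrics": ["Lyrics/<track_id>.lab", "<md5 checksum>"],
        }
    },
}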
Example #6
def make_gtzan_genre_index(data_path):
    index = {}
    for track_key, path in iter_paths(data_path):
        abspath = os.path.join(data_path, path)
        if not os.path.exists(abspath):
            print("Missing file: {}".format(abspath))
            continue

        checksum = md5(abspath)
        audio_path = os.path.join("gtzan_genre/genres", path)
        index[track_key] = {"audio": [audio_path, checksum]}

    with open(GTZAN_GENRE_INDEX_PATH, "w") as f:
        json.dump(index, f, indent=2)
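
iter_paths is not shown here; it is expected to yield (track_key, relative path) pairs under data_path. A hedged sketch assuming the standard GTZAN layout of genre subfolders containing .wav files; the key derivation is an assumption, not necessarily the original helper's logic.

import glob
import os

def iter_paths_sketch(data_path):
    # Yield (track_key, relative_path) for every .wav file one directory deep,
    # e.g. ("blues.00000", "blues/blues.00000.wav").
    for abspath in sorted(glob.glob(os.path.join(data_path, "*", "*.wav"))):
        relative_path = os.path.relpath(abspath, data_path)
        track_key = os.path.splitext(os.path.basename(abspath))[0]
        yield track_key, relative_path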
Example #7
def update_index(all_indexes):
    """Function to update indexes to new format.
    Parameters
    ----------
    all_indexes (list): list of all current dataset indexes


    """

    for index_name in tqdm(all_indexes):
        module = index_name.replace('_index.json', '')

        # load old index
        old_index = mirdata.Dataset(module)._index

        # avoid modifying when running multiple times
        if 'tracks' in old_index.keys():
            old_index = old_index['tracks']

        data_home = mirdata.Dataset(module).data_home

        # get metadata checksum
        metadata_files = get_metadata_paths(module)
        metadata_checksums = None

        if metadata_files is not None:
            metadata_checksums = {key: [metadata_files[key],
                                        md5(os.path.join(data_home, metadata_files[key]))]
                                  for key in metadata_files.keys()}

        # get version of dataset
        version = get_dataset_version(module)

        # Some datasets have a single metadata file, some have multiple.
        # The computation of the checksum should be customized in the make_index
        # of each dataset. This is a patch to convert previous indexes to the new format.
        new_index = {'version': version,
                     'tracks': old_index}

        if metadata_files is not None:
            new_index['metadata'] = metadata_checksums

        with open(os.path.join(INDEXES_PATH, index_name), 'w') as fhandle:
            json.dump(new_index, fhandle, indent=2)
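
A hedged usage sketch, assuming the index files live in INDEXES_PATH and follow the <module>_index.json naming the loop above relies on:

import os

# Collect every *_index.json file and rewrite it in the new format.
all_indexes = sorted(f for f in os.listdir(INDEXES_PATH) if f.endswith('_index.json'))
update_index(all_indexes)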