def make_classicalDB_index(data_path):
    """Create the classicalDB index from a local copy of the dataset.

    For each key annotation file (``keys/*.txt``) record the path relative to
    ``data_path`` and the md5 checksum of the corresponding audio, key,
    spectrum, HPCP and musicbrainz metadata files, then write the index as
    JSON to ``classicalDB_INDEX_PATH``.

    Args:
        data_path (str): path to the local classicalDB dataset root
    """
    audio_dir = os.path.join(data_path, 'audio')
    key_dir = os.path.join(data_path, 'keys')
    spectrum_dir = os.path.join(data_path, 'spectrums')
    HPCP_dir = os.path.join(data_path, 'HPCPs')
    mb_dir = os.path.join(data_path, 'musicbrainz_metadata')

    def _entry(path):
        # (path relative to the dataset root, checksum of the local file)
        return (path.replace(data_path + '/', ''), md5(path))

    classicalDB_index = {}
    for track_id, key_file in enumerate(sorted(os.listdir(key_dir))):
        # endswith avoids matching names that merely contain ".txt";
        # enumerate still counts skipped entries, matching the original ids
        if not key_file.endswith('.txt'):
            continue
        # hoisted: the stem was previously recomputed for every file type
        stem = os.path.splitext(key_file)[0]
        classicalDB_index[track_id] = {
            'audio': _entry(os.path.join(audio_dir, stem + '.wav')),
            'key': _entry(os.path.join(key_dir, key_file)),
            'spectrum': _entry(os.path.join(spectrum_dir, stem + '.json')),
            'mb': _entry(os.path.join(mb_dir, stem + '.json')),
            'HPCP': _entry(os.path.join(HPCP_dir, stem + '.json')),
        }
    with open(classicalDB_INDEX_PATH, 'w') as fhandle:
        json.dump(classicalDB_index, fhandle, indent=2)
def make_beatport_key_index(data_path):
    """Create the beatport_key index from a local copy of the dataset.

    For each key annotation (``keys/*.txt``) record the relative path and md5
    checksum of the audio, key and (optional) metadata files, then write the
    index as JSON to ``beatport_key_INDEX_PATH``.

    Args:
        data_path (str): path to the local beatport_key dataset root
    """
    meta_dir = os.path.join(data_path, 'meta')
    audio_dir = os.path.join(data_path, 'audio')
    key_dir = os.path.join(data_path, 'keys')
    beatport_key_index = {
        'version': '1.0.0',
        'tracks': {},
        'metadata': None,
    }
    for track_id, ann_file in enumerate(sorted(os.listdir(key_dir))):
        # endswith avoids matching names that merely contain ".txt";
        # enumerate still counts skipped entries, matching the original ids
        if not ann_file.endswith('.txt'):
            continue
        audio_path = os.path.join(audio_dir, ann_file.replace('.txt', '.mp3'))
        key_path = os.path.join(key_dir, ann_file)
        meta_path = os.path.join(meta_dir, ann_file.replace('.txt', '.json'))
        # some tracks have no metadata file; index those as (None, None)
        if os.path.exists(meta_path):
            meta = (meta_path.replace(data_path + '/', ''), md5(meta_path))
        else:
            meta = (None, None)
        beatport_key_index['tracks'][track_id] = {
            'audio': (audio_path.replace(data_path + '/', ''), md5(audio_path)),
            'meta': meta,
            'key': (key_path.replace(data_path + '/', ''), md5(key_path)),
        }
    with open(beatport_key_INDEX_PATH, 'w') as fhandle:
        json.dump(beatport_key_index, fhandle, indent=2)
def make_queen_index(data_path):
    """Create the Queen dataset index from a local copy of the dataset.

    Walks the ``annotations/all/Queen`` CD directories, derives a numeric
    track id from the CD and track filenames, and records the audio path plus
    chords/keys/sections annotation paths and checksums for each track.
    Writes the index as JSON to ``QUEEN_INDEX_PATH``.

    Args:
        data_path (str): path to the directory containing the ``Queen`` folder
    """
    annotations_dir = os.path.join(data_path, 'Queen', 'annotations')
    cds_dir = os.path.join(annotations_dir, 'all', 'Queen')
    audio_dir = os.path.join(data_path, 'Queen', 'audio')
    cds = sorted(os.listdir(cds_dir))
    track_ids = []
    queen_tracks = {}
    totfiles = []
    track_id = 0
    for c in cds:
        # NOTE(review): cds_dir is reassigned inside the annotation-type loop
        # below, so from the second CD onward this listdir reads the LAST
        # annotation type's directory instead of 'all' — confirm this is the
        # intended behavior (it only works if the listings coincide).
        for t in sorted(os.listdir(os.path.join(cds_dir, c))):
            if 'ttl' in t:
                totfiles.append(t)
                if 'CD' in t:
                    # id scheme: '10' + last digit of CD folder + 2-digit track
                    # number taken from the third underscore field of the name
                    track_id = '10{}{}'.format(
                        os.path.basename(c).split('_')[0][-1],
                        os.path.basename(t).split('_')[2][:2],
                    )
                    track_ids.append(track_id)
                    # checksum of the flac with the same stem as the ttl file
                    audio_checksum = md5(
                        os.path.join(audio_dir, c, '{}.flac'.format(t[:-4])))
                    audio_path = '{}/{}'.format(
                        'audio', os.path.join(c, '{}.flac'.format(t[:-4])))
                    # collect one (path, checksum) pair per annotation type,
                    # in QUEEN_ANNOTATION_SCHEMA order; missing files -> None
                    annot_checksum, annot_rels = [], []
                    for annot_type in QUEEN_ANNOTATION_SCHEMA:
                        cds_dir = os.path.join(annotations_dir, annot_type,
                                               'Queen')
                        annot_path = os.path.join(cds_dir, c)
                        annot_file = '{}.lab'.format(t[:-4])
                        if os.path.exists(os.path.join(annot_path,
                                                       annot_file)):
                            annot_checksum.append(
                                md5(os.path.join(annot_path, annot_file)))
                            annot_rels.append(
                                os.path.join('annotations', annot_type,
                                             'Queen', c, annot_file))
                        else:
                            annot_checksum.append(None)
                            annot_rels.append(None)
                    # assumes schema order is (chords, keys, sections) —
                    # TODO confirm against QUEEN_ANNOTATION_SCHEMA
                    queen_tracks[track_id] = {
                        'audio': (audio_path, audio_checksum),
                        'chords': (annot_rels[0], annot_checksum[0]),
                        'keys': (annot_rels[1], annot_checksum[1]),
                        'sections': (annot_rels[2], annot_checksum[2]),
                    }
                    # NOTE(review): track_id is a str at this point, so this
                    # increment would raise TypeError if executed — it looks
                    # vestigial from an earlier integer-id scheme; confirm
                    # against the upstream script.
                    track_id += 1
    queen_index = {'version': "1.0", 'tracks': queen_tracks, 'metadata': None}
    with open(QUEEN_INDEX_PATH, 'w') as fhandle:
        json.dump(queen_index, fhandle, indent=2)
def make_otmm_makam_index(dataset_data_path):
    """Create the OTMM makam recognition index from a local dataset copy.

    For every per-makam ``*.json`` metadata file, record the remote-relative
    path and local checksum of the metadata and its matching ``*.pitch`` file,
    plus the top-level ``annotations.json``. Writes the result to
    ``OTMM_MAKAM_INDEX_PATH``.

    Args:
        dataset_data_path (str): path to the local dataset root
    """
    # archive root prefix used for all remote-relative paths; was repeated
    # three times in the original
    remote_root = 'MTG-otmm_makam_recognition_dataset-f14c0d0'
    otmm_index = {'version': 'dlfm2016', 'tracks': {}, 'metadata': []}
    data_dir = os.path.join(dataset_data_path, 'data')
    for makam in os.listdir(data_dir):
        # entries with a '.' are plain files; makams are directories
        if '.' in makam:
            continue
        makam_dir = os.path.join(data_dir, makam)
        for track in os.listdir(makam_dir):
            # endswith avoids matching names that merely contain ".json"
            if not track.endswith('.json'):
                continue
            # track id is the filename without the .json extension
            index = track.split('.json')[0]
            pitch_path = index + '.pitch'
            otmm_index['tracks'][index] = {
                "metadata": [
                    os.path.join(remote_root, 'data', makam, track),
                    md5(os.path.join(makam_dir, track)),
                ],
                "pitch": [
                    os.path.join(remote_root, 'data', makam, pitch_path),
                    md5(os.path.join(makam_dir, pitch_path)),
                ],
            }
    otmm_index['metadata'] = [
        os.path.join(remote_root, 'annotations.json'),
        md5(os.path.join(dataset_data_path, 'annotations.json')),
    ]
    with open(OTMM_MAKAM_INDEX_PATH, 'w') as fhandle:
        json.dump(otmm_index, fhandle, indent=2)
def test_md5(mocker):
    """md5 of a mocked file's bytes must match the precomputed checksum."""
    fake_bytes = b"audio1234"
    # patch open() so validate.md5 reads our bytes instead of a real file
    mocker.patch("builtins.open", new=mocker.mock_open(read_data=fake_bytes))
    result = validate.md5("test_file_path")
    assert result == "6dc00d1bac757abe4ea83308dde68aab"
def make_tonas_index(dataset_data_path):
    """Create the TONAS index from a local copy of the dataset.

    For each ``*.wav`` inside every style directory, record the relative path
    and checksum of the audio plus its corrected f0 and notes annotations,
    and the top-level ``TONAS-Metadata.txt``. Writes the index to
    ``TONAS_INDEX_PATH``.

    Args:
        dataset_data_path (str): path to the local TONAS dataset root
    """
    tonas_index = {"version": "1.0", "tracks": {}}
    for style in os.listdir(os.path.join(dataset_data_path)):
        # entries with a '.' are plain files; styles are directories
        if "." in style:
            continue
        # hoisted: the style directory join was repeated for every file
        style_dir = os.path.join(dataset_data_path, style)
        for track in os.listdir(style_dir):
            # endswith avoids matching names that merely contain ".wav"
            if not track.endswith(".wav"):
                continue
            index = track.replace(".wav", "")
            f0_path = index + ".f0.Corrected"
            notes_path = index + ".notes.Corrected"
            tonas_index["tracks"][index] = {
                "audio": [
                    os.path.join(style, track),
                    md5(os.path.join(style_dir, track)),
                ],
                "f0": [
                    os.path.join(style, f0_path),
                    md5(os.path.join(style_dir, f0_path)),
                ],
                "notes": [
                    os.path.join(style, notes_path),
                    md5(os.path.join(style_dir, notes_path)),
                ],
            }
    tonas_index["metadata"] = {
        "TONAS-Metadata": [
            "TONAS-Metadata.txt",
            md5(os.path.join(dataset_data_path, "TONAS-Metadata.txt")),
        ]
    }
    with open(TONAS_INDEX_PATH, "w") as fhandle:
        json.dump(tonas_index, fhandle, indent=2)
def make_dataset_index(dataset_data_path):
    """Create the dataset index from annotation and audio files.

    Track ids come from the ``annotation/*.lab`` filenames; each track gets
    its wav and lab relative paths with checksums, plus a top-level
    ``id_mapping.txt`` metadata entry. Writes the index to
    ``DATASET_INDEX_PATH``.

    Args:
        dataset_data_path (str): path to the local dataset root
    """
    annotation_dir = os.path.join(dataset_data_path, "annotation")
    lab_files = glob.glob(os.path.join(annotation_dir, "*.lab"))
    track_ids = sorted(os.path.basename(f).split(".")[0] for f in lab_files)

    # top-key level metadata
    metadata_checksum = md5(os.path.join(dataset_data_path, "id_mapping.txt"))

    # top-key level tracks
    index_tracks = {}
    for track_id in track_ids:
        audio_rel = "Wavfile/{}.wav".format(track_id)
        annotation_rel = "annotation/{}.lab".format(track_id)
        index_tracks[track_id] = {
            "audio": (
                audio_rel,
                md5(os.path.join(dataset_data_path, audio_rel)),
            ),
            "annotation": (
                annotation_rel,
                md5(os.path.join(dataset_data_path, annotation_rel)),
            ),
        }

    # combine version, metadata and tracks into the final index
    dataset_index = {
        "version": None,
        "metadata": {"id_mapping": ("id_mapping.txt", metadata_checksum)},
        "tracks": index_tracks,
    }
    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)
def update_index(all_indexes):
    """Function to update indexes to new format.

    Parameters
    ----------
    all_indexes (list): list of all current dataset indexes

    """
    for index_name in tqdm(all_indexes):
        module = index_name.replace("_index.json", "")

        # initialize the dataset module once and reuse it (the original
        # called mirdata.initialize(module) twice per iteration)
        dataset = mirdata.initialize(module)
        old_index = dataset._index
        # avoid modifying when running multiple times
        if "tracks" in old_index.keys():
            old_index = old_index["tracks"]
        data_home = dataset.data_home

        # get metadata checksums
        metadata_files = get_metadata_paths(module)
        metadata_checksums = None
        if metadata_files is not None:
            metadata_checksums = {
                key: [
                    metadata_files[key],
                    md5(os.path.join(data_home, metadata_files[key])),
                ]
                for key in metadata_files.keys()
            }

        # get version of dataset
        version = get_dataset_version(module)

        # Some datasets have a single metadata file, some have multiple.
        # The computation of the checksum should be customized in the make_index
        # of each dataset. This is a patch to convert previous indexes to the new format.
        new_index = {"version": version, "tracks": old_index}
        if metadata_files is not None:
            new_index["metadata"] = metadata_checksums

        with open(os.path.join(INDEXES_PATH, index_name), "w") as fhandle:
            json.dump(new_index, fhandle, indent=2)
def make_acousticbrainz_genre_index(data_path):
    """Create the acousticbrainz_genre index as hand-formatted JSON.

    Iterates every (dataset, split) tsv, derives a composite track id from
    each row, and writes one ``"<track_id>": {"data": [path, checksum]}``
    entry per row. The JSON is assembled manually with f.write (not
    json.dump), so the output format strings must not be altered.

    Args:
        data_path (str): path to the local acousticbrainz-mediaeval data root
    """
    datasets = ['tagtraum', 'allmusic', 'lastfm', 'discogs']
    dataset_types = ['validation', 'train']
    # fix: the output file and every tsv were previously opened without
    # being closed; use context managers
    with open(acousticbrainz_genre_INDEX_PATH, 'w') as f:
        f.write('{\n')
        for dataset, dataset_type in itertools.product(datasets,
                                                       dataset_types):
            tsv_path = os.path.join(
                data_path, "acousticbrainz-mediaeval-" + dataset + "-" +
                dataset_type + ".tsv")
            with open(tsv_path) as tsv_file:
                read_tsv = csv.reader(tsv_file, delimiter="\t")
                next(read_tsv, None)  # skip the header row
                read_tsv_list = list(read_tsv)
            for line, row in enumerate(read_tsv_list):
                # track id is dataset#split#<all row fields joined by '#'>;
                # the first field is the recording mbid
                mbid = ""
                track_id = dataset + '#' + dataset_type
                for i, r in enumerate(row):
                    track_id = track_id + '#' + r
                    if i == 0:
                        mbid = r
                # annotations are sharded by the first two mbid characters
                ann_path = os.path.join(
                    data_path, "acousticbrainz-mediaeval-" + dataset_type,
                    mbid[:2], mbid + ".json")
                f.write(' "%s": {\n' % (track_id, ))
                f.write(' "data": [\n')
                f.write(' "%s",\n' % (ann_path.replace(data_path + '/', ''), ))
                f.write(' "%s"\n' % md5(ann_path))
                f.write(' ]\n')
                # only the very last entry omits the trailing comma
                is_the_last = dataset == datasets[
                    -1] and dataset_type == dataset_types[-1] and line == len(
                        read_tsv_list) - 1
                if not is_the_last:
                    f.write(' },\n')
                else:
                    f.write(' }\n')
        f.write('}')
def make_saraga_hindustani_index(dataset_data_path):
    """Create the Saraga Hindustani index from a local copy of the dataset.

    Walks every concert/song directory and, for each known file-name pattern,
    records the path relative to the directory containing
    ``saraga1.5_hindustani`` plus its md5 checksum. Missing annotations stay
    as (None, None). Writes the index to ``SARAGA_HINDUSTANI_INDEX_PATH``.

    Args:
        dataset_data_path (str): path to the ``saraga1.5_hindustani`` folder
    """
    # (substring pattern, index key) — same patterns in the same order as the
    # original copy-pasted if-blocks; the key order also fixes the output
    # dict order per track
    patterns = [
        ('.mp3', 'audio'),
        ('ctonic', 'ctonic'),
        ('pitch.', 'pitch'),
        ('tempo-manual', 'tempo'),
        ('sama-manual', 'sama'),
        ('sections-manual-p', 'sections'),
        ('mphrase', 'phrases'),
        ('.json', 'metadata'),
    ]
    saraga_index = {
        'version': 1.5,
        'tracks': {},
    }
    idx = 0
    dataset_data_path_prev = dataset_data_path.split('saraga1.5_hindustani')[0]
    for concert in os.listdir(dataset_data_path):
        # entries with a '.' are plain files; concerts/songs are directories
        if '.' in concert:
            continue
        for song in os.listdir(os.path.join(dataset_data_path, concert)):
            if '.' in song:
                continue
            # Declare track attributes
            index = str(idx) + '_' + song.replace(' ', '_')
            print(index)
            track = {key: (None, None) for _, key in patterns}
            for file in os.listdir(
                    os.path.join(dataset_data_path, concert, song)):
                for pattern, key in patterns:
                    if pattern in file:
                        rel_path = os.path.join('saraga1.5_hindustani/',
                                                concert, song, file)
                        checksum = md5(
                            os.path.join(dataset_data_path_prev, rel_path))
                        track[key] = (rel_path, checksum)
            saraga_index['tracks'][index] = track
            idx = idx + 1
    with open(SARAGA_HINDUSTANI_INDEX_PATH, 'w') as fhandle:
        json.dump(saraga_index, fhandle, indent=2)
def make_dataset_index(data_path):
    """Create the track/multitrack index for the choir multitrack dataset.

    Pieces are inferred from the wav filenames (first 4 underscore fields).
    Each piece becomes a multitrack with stereo mixes, optional beats and
    piano stems; each singer within a piece becomes a track with per-mic
    audio and CREPE/pYIN f0 annotations, an optional score, and (added in a
    final pass) manual f0 annotations. Writes the index to
    ``DATASET_INDEX_PATH``.

    Args:
        data_path (str): path to the local dataset root
    """
    audio_dir = os.path.join(data_path, "audio_wav_22050_mono")
    index = {"version": "1.2.3", "tracks": {}, "multitracks": {}}

    # define pieces directly from data directory
    pieces = sorted(
        list(
            set([
                "_".join(filename.split("/")[-1].split("_")[:4])
                for filename in glob.glob(os.path.join(audio_dir, "*.wav"))
            ])))
    for ip, piece in enumerate(pieces):
        index["multitracks"][piece] = {}

        ## add mixture audios
        # STM
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_Stereo_STM.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_stm"] = (
            "audio_wav_22050_mono/{}_Stereo_STM.wav".format(piece),
            audio_checksum,
        )
        # STR
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_Stereo_STR.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_str"] = (
            "audio_wav_22050_mono/{}_Stereo_STR.wav".format(piece),
            audio_checksum,
        )
        # STL
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_Stereo_STL.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_stl"] = (
            "audio_wav_22050_mono/{}_Stereo_STL.wav".format(piece),
            audio_checksum,
        )
        # STRev (reverberant stereo mix)
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_StereoReverb_STM.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_rev"] = (
            "audio_wav_22050_mono/{}_StereoReverb_STM.wav".format(piece),
            audio_checksum,
        )
        # beats — placeholder; filled in below when the piece has a score
        index["multitracks"][piece]["beat"] = (None, None)
        # piano — placeholders; filled in below when piano stems exist
        index["multitracks"][piece]["audio_spl"] = (None, None)
        index["multitracks"][piece]["audio_spr"] = (None, None)

        ## add each track inside the multitrack
        audio_files = sorted(
            glob.glob(os.path.join(audio_dir, "{}*.wav".format(piece))))
        # singer id is the second-to-last underscore field of the filename
        singers = [
            singer.split("_")[-2] for singer in audio_files
            if not "Stereo" in singer
        ]
        # second step to remove piano from singers
        singers = [singer for singer in singers if "Piano" not in singer]
        # mics = [singer.split('_')[-1].split('.')[0] for singer in audio_files if not 'Stereo' in singer]
        # assert len(singers) == len(mics), "number of mics does not match number of singers for {}".format(piece)
        # set_singers = set(singers)
        index["multitracks"][piece]["tracks"] = []
        for sidx, singer in enumerate(sorted(singers)):
            track_name = "{}_{}".format(piece, singer)
            # define fields as None
            index["tracks"][track_name] = {
                "audio_dyn": (None, None),
                "audio_hsm": (None, None),
                "audio_lrx": (None, None),
                "f0_crepe_dyn": (None, None),
                "f0_crepe_hsm": (None, None),
                "f0_crepe_lrx": (None, None),
                "f0_pyin_dyn": (None, None),
                "f0_pyin_hsm": (None, None),
                "f0_pyin_lrx": (None, None),
                "f0_manual_lrx": (None, None),
                "score": (None, None),
            }
            index["multitracks"][piece]["tracks"].append(track_name)
            # mic id is the last underscore field before the extension
            mics = [
                mic.split("_")[-1].split(".")[0] for mic in glob.glob(
                    os.path.join(audio_dir, "{}_{}*.wav".format(
                        piece, singer))) if mic not in ["SPL", "SPR"]
            ]
            ### add all fields for each track
            for mic in mics:
                ## add audio
                audio_stem_dir = os.path.join(
                    data_path,
                    "audio_wav_22050_mono",
                    "{}_{}_{}.wav".format(piece, singer, mic),
                )
                audio_checksum = md5(audio_stem_dir)
                index["tracks"][track_name]["audio_{}".format(mic.lower())] = (
                    "audio_wav_22050_mono/{}_{}_{}.wav".format(
                        piece, singer, mic),
                    audio_checksum,
                )
                ## add crepe f0s
                crepe_dir = os.path.join(
                    data_path,
                    "annotations_csv_F0_CREPE",
                    "{}_{}_{}.csv".format(piece, singer, mic),
                )
                crepe_checksum = md5(crepe_dir)
                index["tracks"][track_name]["f0_crepe_{}".format(
                    mic.lower())] = (
                        "annotations_csv_F0_CREPE/{}_{}_{}.csv".format(
                            piece, singer, mic),
                        crepe_checksum,
                    )
                ## add pyin f0s
                pyin_dir = os.path.join(
                    data_path,
                    "annotations_csv_F0_PYIN",
                    "{}_{}_{}.csv".format(piece, singer, mic),
                )
                pyin_checksum = md5(pyin_dir)
                index["tracks"][track_name]["f0_pyin_{}".format(
                    mic.lower())] = (
                        "annotations_csv_F0_PYIN/{}_{}_{}.csv".format(
                            piece, singer, mic),
                        pyin_checksum,
                    )
            ## add score when it exists
            # some have no associated score
            # NOTE(review): the score filename uses singer[0] — presumably
            # the first character of the singer id encodes the voice part;
            # confirm against the dataset naming convention.
            if not any(x in piece for x in NO_SCORE):
                score_dir = os.path.join(
                    data_path,
                    "annotations_csv_scorerepresentation",
                    "{}_Stereo_STM_{}.csv".format(piece, singer[0]),
                )
                score_checksum = md5(score_dir)
                index["tracks"][track_name]["score"] = (
                    "annotations_csv_scorerepresentation/{}_Stereo_STM_{}.csv"
                    .format(piece, singer[0]),
                    score_checksum,
                )

        ## add beats for the full songs when available
        if not any(x in piece for x in NO_SCORE):
            ## add beats
            beats_dir = os.path.join(data_path, "annotations_csv_beat",
                                     "{}_Stereo_STM.csv".format(piece))
            beats_checksum = md5(beats_dir)
            index["multitracks"][piece]["beat"] = (
                "annotations_csv_beat/{}_Stereo_STM.csv".format(piece),
                beats_checksum,
            )

        ## check if piano track exists and add it to the mtrack if so
        audio_pianoL_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                        "{}_Piano_SPL.wav".format(piece))
        if os.path.exists(audio_pianoL_dir):
            # add piano SPL
            audio_checksum = md5(audio_pianoL_dir)
            index["multitracks"][piece]["audio_spl"] = (
                "audio_wav_22050_mono/{}_Piano_SPL.wav".format(piece),
                audio_checksum,
            )
            # add piano SPR
            audio_checksum = md5(audio_pianoL_dir.replace("SPL", "SPR"))
            index["multitracks"][piece]["audio_spr"] = (
                "audio_wav_22050_mono/{}_Piano_SPR.wav".format(piece),
                audio_checksum,
            )
        # tracks should not be repeated
        index["multitracks"][piece]["tracks"] = sorted(
            list(set(index["multitracks"][piece]["tracks"])))

    ## add the manual annotations to their corresponding tracks
    manual_files = sorted(
        glob.glob(os.path.join(data_path, "annotations_csv_F0_manual",
                               "*.csv")))
    for mf in manual_files:
        # track name is the manual csv filename minus its last field
        track_name = "_".join(os.path.basename(mf).split("_")[:-1])
        manual_checksum = md5(mf)
        index["tracks"][track_name]["f0_manual_lrx"] = (
            "annotations_csv_F0_manual/{}".format(os.path.basename(mf)),
            manual_checksum,
        )
    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
def download_from_remote(remote, save_dir, force_overwrite):
    """Download a remote dataset into path
    Fetch a dataset pointed by remote's url, save into path using remote's
    filename and ensure its integrity based on the MD5 Checksum of the
    downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Args:
        remote (RemoteFileMetadata): Named tuple containing remote dataset
            meta information: url, filename and checksum
        save_dir (str): Directory to save the file to. Usually `data_home`
        force_overwrite (bool):
            If True, overwrite existing file with the downloaded file.
            If False, does not overwrite, but checks that checksum is consistent.

    Returns:
        str: Full path of the created file.

    Raises:
        IOError: if the downloaded file's checksum does not match
            ``remote.checksum``
    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists() + os.makedirs() pair
    os.makedirs(download_dir, exist_ok=True)

    download_path = os.path.join(download_dir, remote.filename)

    if not os.path.exists(download_path) or force_overwrite:
        # if we got here, we want to overwrite any existing file
        if os.path.exists(download_path):
            os.remove(download_path)

        # If file doesn't exist or we want to overwrite, download it
        with DownloadProgressBar(
            unit="B", unit_scale=True, unit_divisor=1024, miniters=1
        ) as t:
            try:
                urllib.request.urlretrieve(
                    remote.url,
                    filename=download_path,
                    reporthook=t.update_to,
                    data=None,
                )
            except Exception as exc:
                error_msg = """
            mirdata failed to download the dataset from {}!
            Please try again in a few minutes.
            If this error persists, please raise an issue at
            https://github.com/mir-dataset-loaders/mirdata,
            and tag it with 'broken-link'.
            """.format(
                    remote.url
                )
                logging.error(error_msg)
                raise exc
    else:
        logging.info(
            "{} already exists and will not be downloaded. ".format(download_path)
            + "Rerun with force_overwrite=True to delete this file and force the download."
        )

    # always verify integrity, whether freshly downloaded or pre-existing
    checksum = md5(download_path)
    if remote.checksum != checksum:
        raise IOError(
            "{} has an MD5 checksum ({}) "
            "differing from expected ({}), "
            "file may be corrupted.".format(download_path, checksum, remote.checksum)
        )
    return download_path
def make_dataset_index(data_path):
    """Create the index for the 4-piece anechoic orchestral dataset.

    For each piece, collects per-instrument audio stems and their score
    annotations (time-aligned and original), validating that the expected
    number of files and instruments is present. Writes the index to
    ``DATASET_INDEX_PATH``.

    Args:
        data_path (str): path to the local dataset root

    Raises:
        AssertionError: if expected audio or score files are missing
    """
    pieces = ['beethoven', 'bruckner', 'mahler', 'mozart']
    families = {
        'doublebass': 'strings',
        'cello': 'strings',
        'clarinet': 'woodwinds',
        'viola': 'strings',
        'violin': 'strings',
        'oboe': 'woodwinds',
        'flute': 'woodwinds',
        'trumpet': 'brass',
        'bassoon': 'woodwinds',
        'horn': 'brass',
    }
    # expected audio-file count / distinct-instrument count, per piece
    totalinstruments = [20, 39, 30, 10]
    ninstruments = [10, 10, 10, 8]
    index = {'version': 1, 'tracks': {}, 'multitracks': {}}
    for ip, piece in enumerate(pieces):
        index['multitracks'][piece] = {}
        audio_files = sorted(
            glob.glob(os.path.join(data_path, 'audio', piece, '*.wav')))
        # instrument name is the basename without trailing digits,
        # e.g. "violin12.wav" -> "violin"
        instruments = [
            os.path.basename(audio_path).split('.')[0].rstrip(string.digits)
            for audio_path in audio_files
        ]
        set_instruments = list(set(instruments))
        assert (len(instruments) == totalinstruments[ip]
                ), 'audio files for some instruments are missing'
        assert (len(set_instruments) == ninstruments[ip]
                ), 'some instruments are missing from the dataset'
        index['multitracks'][piece]['tracks'] = []
        for instrument in set_instruments:
            assert (
                instrument in families.keys()
            ), "instrument {} is not in the list of dataset instruments".format(
                instrument)
            track_key = piece + '-' + instrument
            index['tracks'][track_key] = {}
            index['multitracks'][piece]['tracks'].append(track_key)

            #### add audios
            instrument_audio_files = sorted(
                glob.glob(
                    os.path.join(data_path, 'audio', piece,
                                 instrument + '*.wav')))
            assert (len(instrument_audio_files) >
                    0), 'no audio has been found for {}'.format(instrument)
            for audio_file in instrument_audio_files:
                audio_checksum = md5(
                    os.path.join(data_path, 'audio', piece,
                                 os.path.basename(audio_file)))
                source = os.path.basename(audio_file).replace('.wav', '')
                index['tracks'][track_key]['audio_' + source] = (
                    'audio/{}/{}'.format(piece, os.path.basename(audio_file)),
                    audio_checksum,
                )

            #### add scores
            # fix: the assert messages previously read
            # "'...{}'.formatos.path.join(" — a missing parenthesis that was
            # a syntax error; the score paths are also now computed once
            score_path = os.path.join(data_path, 'annotations', piece,
                                      '{}.txt'.format(instrument))
            score_original_path = os.path.join(
                data_path, 'annotations', piece,
                '{}_o.txt'.format(instrument))
            assert os.path.exists(
                score_path), 'cannot find score file {}'.format(score_path)
            assert os.path.exists(
                score_original_path), 'cannot find score file {}'.format(
                    score_original_path)
            index['tracks'][track_key]['notes'] = (
                'annotations/{}/{}.txt'.format(piece, instrument),
                md5(score_path),
            )
            index['tracks'][track_key]['notes_original'] = (
                'annotations/{}/{}_o.txt'.format(piece, instrument),
                md5(score_original_path),
            )
    with open(DATASET_INDEX_PATH, 'w') as fhandle:
        json.dump(index, fhandle, indent=2)
def make_jingju_acappella_index(dataset_data_path):
    """Create the Jingju A Cappella index from a local copy of the dataset.

    First pass indexes the audio under the ``wav`` folders; second pass
    attaches the phoneme/phrase_char/phrase/syllable annotations from the
    ``annotation_txt`` folders; final pass records the dan/laosheng catalogue
    metadata. Writes the index to ``JINJGU_ACAPPELLA_INDEX_PATH``.

    Args:
        dataset_data_path (str): path to the local dataset root
    """
    jingju_index = {"version": 7.0, "tracks": {}, "metadata": {}}

    # Building the index while parsing the audio path
    for folder in os.listdir(dataset_data_path):
        if "wav" not in folder:
            continue
        for folder_ in os.listdir(os.path.join(dataset_data_path, folder)):
            # entries with a '.' are plain files; skip them
            if "." in folder_:
                continue
            for song in os.listdir(
                    os.path.join(dataset_data_path, folder, folder_)):
                if ".DS" in song:  # skip macOS .DS_Store artifacts
                    continue
                index = song.replace(".wav", "").replace(".WAV", "")
                jingju_index["tracks"][index] = {
                    "audio": (
                        os.path.join(folder, folder_, song),
                        md5(
                            os.path.join(dataset_data_path, folder, folder_,
                                         song)),
                    ),
                    "phoneme": (None, None),
                    "phrase_char": (None, None),
                    "phrase": (None, None),
                    "syllable": (None, None),
                }

    # Parsing annotations: (filename pattern, suffix to strip, index key) —
    # same patterns, tested in the same order, as the original four branches
    annotation_specs = [
        ("phoneme", "_phoneme.txt", "phoneme"),
        ("phrase_char", "_phrase_char.txt", "phrase_char"),
        ("phrase.txt", "_phrase.txt", "phrase"),
        ("syllable", "_syllable.txt", "syllable"),
    ]
    for folder in os.listdir(dataset_data_path):
        if "annotation_txt" not in folder:
            continue
        for folder_ in os.listdir(os.path.join(dataset_data_path, folder)):
            if "." in folder_:
                continue
            for file in os.listdir(
                    os.path.join(dataset_data_path, folder, folder_)):
                for pattern, suffix, key in annotation_specs:
                    if pattern in file:
                        index = file.replace(suffix, "")
                        jingju_index["tracks"][index][key] = (
                            os.path.join(folder, folder_, file),
                            md5(
                                os.path.join(dataset_data_path, folder,
                                             folder_, file)),
                        )

    # Parsing metadata
    for file in os.listdir(dataset_data_path):
        if "catalogue" not in file:
            continue
        meta_key = "dan_metadata" if "dan" in file else "laosheng_metadata"
        jingju_index["metadata"][meta_key] = (
            file,
            md5(os.path.join(dataset_data_path, file)),
        )
    with open(JINJGU_ACAPPELLA_INDEX_PATH, "w") as fhandle:
        json.dump(jingju_index, fhandle, indent=2)
def make_index(data_path):
    """Create the McGill Billboard index from a local copy of the dataset.

    Reads ``billboard-2.0-index.csv`` to map track ids to release metadata,
    then walks the ``McGill-Billboard`` annotation folders, indexing the
    salami chords file, the lab/chroma/tuning annotations and (when present)
    the audio. Writes the index to ``INDEX_PATH``.

    Args:
        data_path (str): path to the local dataset root
    """
    # map track id -> remaining columns of the csv row
    # fix: the csv file handle was previously opened without being closed
    _index = {}
    with open(os.path.join(data_path, "billboard-2.0-index.csv")) as csv_f:
        for row in csv.reader(csv_f):
            _index[row[0]] = row[1:]

    annotations_dir = os.path.join(data_path, "McGill-Billboard")
    audio_dir = os.path.join(data_path, "audio")
    anns = sorted(os.listdir(annotations_dir))

    index = {
        "version": "2.0",
        "tracks": {},
        "metadata": None,
    }
    for a in anns:
        for t in os.listdir(os.path.join(annotations_dir, a)):
            if t != "salami_chords.txt":
                continue
            fp = os.path.join(annotations_dir, a, t)
            # folder names are zero-padded track ids
            track_id = "{}".format(os.path.basename(a.lstrip("0")))
            if track_id not in _index.keys():
                continue
            release_date = _index[track_id][0]
            track_name = _index[track_id][3]
            artist = _index[track_id][4]
            # decade folder name, e.g. 1983 -> "1980s"
            # (round(..., -1) uses banker's rounding for years ending in 5)
            _release_date = "{}s".format(
                round(int(release_date.split("-")[0]), -1))
            audio_path = os.path.join(audio_dir, _release_date, artist,
                                      track_name, "audio.flac")
            # audio is optional; missing files are indexed with a None checksum
            audio_checksum = md5(audio_path) if os.path.exists(
                audio_path) else None
            # NOTE(review): the salami path is stored under "annotation" while
            # the files live under "McGill-Billboard" — confirm intended.
            annot_rel = os.path.join("annotation", a, t)
            audio_rel = os.path.join("audio", _release_date, artist,
                                     track_name, "audio.flac")

            def _mcgill(name):
                # (index-relative path, checksum) for a McGill-Billboard file
                return (
                    os.path.join("McGill-Billboard", a, name),
                    md5(os.path.join(annotations_dir, a, name)),
                )

            index["tracks"][track_id] = {
                "audio": (audio_rel, audio_checksum),
                "salami": (annot_rel, md5(fp)),
                "bothchroma": _mcgill("bothchroma.csv"),
                "tuning": _mcgill("tuning.csv"),
                "lab_full": _mcgill("full.lab"),
                "lab_majmin7": _mcgill("majmin7.lab"),
                "lab_majmin7inv": _mcgill("majmin7inv.lab"),
                "lab_majmin": _mcgill("majmin.lab"),
                "lab_majmininv": _mcgill("majmininv.lab"),
            }
    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
def make_saraga_carnatic_index(dataset_data_path):
    """Create the Saraga Carnatic index from a local copy of the dataset.

    Walks every concert/song directory. Multitrack mp3 stems (ghatam,
    mridangam, violin, vocal) are matched by instrument name; non-multitrack
    mp3s become the mix; remaining files are matched against the annotation
    patterns. Paths are stored relative to the directory containing
    ``saraga1.5_carnatic``, with md5 checksums. Missing items stay as
    (None, None). Writes the index to ``SARAGA_CARNATIC_INDEX_PATH``.

    Args:
        dataset_data_path (str): path to the ``saraga1.5_carnatic`` folder
    """
    # (substring pattern, index key) tables — same patterns, tested in the
    # same order, as the original copy-pasted if-blocks
    multitrack_patterns = [
        ('ghatam', 'audio-ghatam'),
        ('mridangam-left', 'audio-mridangam-left'),
        ('mridangam-right', 'audio-mridangam-right'),
        ('violin', 'audio-violin'),
        ('vocal-s', 'audio-vocal-s'),
        ('vocal.', 'audio-vocal'),
    ]
    annotation_patterns = [
        ('ctonic.', 'ctonic'),
        ('pitch.', 'pitch'),
        ('pitch-vocal', 'pitch-vocal'),
        ('tempo-manual', 'tempo'),
        ('sama-manual', 'sama'),
        ('sections-manual-p.txt', 'sections'),
        ('mphrase', 'phrases'),
        ('.json', 'metadata'),
    ]
    # fixes the per-track output key order to match the original dict literal
    track_keys = [
        'audio-mix', 'audio-ghatam', 'audio-mridangam-left',
        'audio-mridangam-right', 'audio-violin', 'audio-vocal-s',
        'audio-vocal', 'ctonic', 'pitch', 'pitch-vocal', 'tempo', 'sama',
        'sections', 'phrases', 'metadata'
    ]
    saraga_index = {
        'version': 1.5,
        'tracks': {},
    }
    idx = 0
    dataset_data_path_prev = dataset_data_path.split('saraga1.5_carnatic/')[0]

    def _entry(concert, song, file):
        # (dataset-relative path, checksum of the local file)
        rel_path = os.path.join('saraga1.5_carnatic', concert, song, file)
        return (rel_path, md5(os.path.join(dataset_data_path_prev, rel_path)))

    for concert in os.listdir(dataset_data_path):
        # entries with a '.' are plain files; concerts/songs are directories
        if '.' in concert:
            continue
        for song in os.listdir(os.path.join(dataset_data_path, concert)):
            if '.' in song:
                continue
            # Declare track attributes
            index = str(idx) + '_' + song.replace(' ', '_')
            print(index)
            track = {key: (None, None) for key in track_keys}
            for file in os.listdir(
                    os.path.join(dataset_data_path, concert, song)):
                if '.mp3' in file:
                    if 'multitrack' in file:
                        for pattern, key in multitrack_patterns:
                            if pattern in file:
                                track[key] = _entry(concert, song, file)
                    else:
                        track['audio-mix'] = _entry(concert, song, file)
                for pattern, key in annotation_patterns:
                    if pattern in file:
                        track[key] = _entry(concert, song, file)
            saraga_index['tracks'][index] = track
            idx = idx + 1
    with open(SARAGA_CARNATIC_INDEX_PATH, 'w') as fhandle:
        json.dump(saraga_index, fhandle, indent=2)