def process_h5_file_info(self, h5):
    """Collect up to five 300-segment timbre windows from an open song file.

    Reads the track id, the artist MusicBrainz tags and the per-segment
    timbre matrix from ``h5``. For every window taken, one entry is appended
    to ``self.ids_list``, ``self.tags_list`` and ``self.timbres_list``.

    The song is skipped (return value 0) when any getter fails, when the
    song has no tags, when none of its tags is a key of ``self.styles``, or
    when it has fewer than 300 timbre segments.

    Windows start every 400 segments and are 300 segments long, so a song
    yields one window per complete run of 400 segments, capped at five.
    The caller owns ``h5``; it is neither opened nor closed here.

    Returns:
        The number of windows appended (0 to 5).
    """
    try:
        track_id = Getters.get_track_id(h5)
        tags = Getters.get_artist_mbtags(h5)
        timbres = Getters.get_segments_timbre(h5)
    except Exception:
        # Best effort: an unreadable song is skipped, but Ctrl-C and
        # SystemExit are no longer swallowed as a bare ``except`` would.
        return 0

    if len(tags) == 0:
        return 0

    # Multi-hot encoding of the tags that have a slot in self.styles.
    tag_list = np.zeros(NUMBER_OF_TAGS)
    some_seen = False
    for tag in tags:
        if tag in self.styles:
            tag_list[self.styles[tag]] = 1
            some_seen = True
    if not some_seen:
        return 0

    if len(timbres) < 300:
        return 0

    # Floor division is required: with "/" Python 3 produces a float and
    # range() raises TypeError. Only take 5 windows at most.
    n_windows = min(len(timbres) // 400, 5)
    if n_windows == 0:
        return 0

    # The label is identical for every window, so look it up once.
    # get_artist_name needs the file handle, like every other getter.
    artist_name = Getters.get_artist_name(h5)
    title = Getters.get_title(h5)
    if isinstance(artist_name, bytes):
        artist_name = artist_name.decode("utf-8")
    if isinstance(title, bytes):
        title = title.decode("utf-8")
    label = "{}: {}".format(artist_name, title)

    created = 0
    for start in range(0, n_windows * 400, 400):
        self.ids_list.append(track_id)
        self.tags_list.append(tag_list)
        self.timbres_list.append(timbres[start:(start + 300), ])
        print(label)
        created += 1
    return created
OUTDIR = '../data/audio' wmf_item2i = pickle.load(open('../data/wmf/index_dicts.pkl', 'rb'))['item2i'] track_to_song = pickle.load(open('../data/wmf/track_to_song.pkl', 'rb')) h5path = '../data/song_metadata/msd_summary_file.h5' if not os.path.isdir(OUTDIR): os.mkdir(OUTDIR) h5 = hdf5_utils.open_h5_file_read(h5path) num_songs = GETTERS.get_num_songs(h5) for i in range(num_songs): artist_name = GETTERS.get_artist_name(h5, songidx=i).decode('utf-8') track_name = GETTERS.get_title(h5, songidx=i).decode('utf-8') track_id = GETTERS.get_track_id(h5, songidx=i).decode('utf-8') out_path = os.path.join(OUTDIR, os.path.splitext(track_id)[0]) + '.mp3' if os.path.exists( out_path) or not track_to_song[track_id] in wmf_item2i.keys(): continue track_name = re.sub('_', '', track_name) artist_name_re = re.sub(' *([;_/&,*]|(feat))+.*', '', artist_name) artist_name_re = re.sub(' *[\[\(]*feat*.*[\]\)]*', '', artist_name_re, flags=re.IGNORECASE) track_name_re = re.sub(' *[\[\(]+.*[\]\)]+', '', track_name) artist_name_re = re.sub(' *[\[\(]*featuring*.*[\]\)]*',
def hdf5_to_csv(directory):
    """Write one CSV row per file found under *directory* to ``msds.csv``.

    Walks *directory* recursively, opens every file with
    ``hdf5_getters.open_h5_file_read``, reads the summary fields listed in
    the header row and writes them to ``msds.csv`` in the current working
    directory. The first column is a running row index starting at 0.

    Rows go through ``csv.writer`` so that artist names, titles and release
    names containing commas or quotes are quoted instead of corrupting the
    column layout.

    Args:
        directory: Root directory to search for song files.
    """
    # Local import: the file's top-level import block is not visible here.
    import csv

    headers = [
        "index", "artist_name", "danceability", "duration", "end_of_fade_in",
        "energy", "key", "key_confidence", "loudness", "mode",
        "mode_confidence", "artist_hotttness", "song_hotttness",
        "start_of_fade_out", "tempo", "time_signature",
        "time_signature_confidence", "title", "release", "year", "track_id",
    ]

    index = 0
    # newline="" is what the csv module expects; "\n" keeps the original
    # line ending for rows that need no quoting.
    with open("msds.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, lineterminator="\n")
        writer.writerow(headers)

        # Recursively visit every sub-directory looking for song files.
        for root, _dirs, filenames in os.walk(directory):
            for filename in filenames:
                h5_file = hdf5_getters.open_h5_file_read(
                    os.path.join(root, filename))
                try:
                    # Text fields come back as bytes and are decoded once
                    # here; everything downstream works on str.
                    artist = hdf5_getters.get_artist_name(h5_file).decode("UTF-8")
                    title = hdf5_getters.get_title(h5_file).decode("UTF-8")
                    # Release presumably means the album title.
                    release = hdf5_getters.get_release(h5_file).decode("UTF-8")
                    song_id = hdf5_getters.get_track_id(h5_file).decode("UTF-8")
                    numeric_fields_before_title = [
                        hdf5_getters.get_danceability(h5_file),
                        hdf5_getters.get_duration(h5_file),
                        hdf5_getters.get_end_of_fade_in(h5_file),
                        hdf5_getters.get_energy(h5_file),
                        hdf5_getters.get_key(h5_file),
                        hdf5_getters.get_key_confidence(h5_file),
                        hdf5_getters.get_loudness(h5_file),
                        hdf5_getters.get_mode(h5_file),
                        hdf5_getters.get_mode_confidence(h5_file),
                        hdf5_getters.get_artist_hotttnesss(h5_file),
                        hdf5_getters.get_song_hotttnesss(h5_file),
                        hdf5_getters.get_start_of_fade_out(h5_file),
                        hdf5_getters.get_tempo(h5_file),
                        hdf5_getters.get_time_signature(h5_file),
                        hdf5_getters.get_time_signature_confidence(h5_file),
                    ]
                    year = hdf5_getters.get_year(h5_file)
                finally:
                    # Close the handle even when a getter raises.
                    h5_file.close()

                # Explicit str() keeps the numeric formatting of the
                # original string concatenation.
                row = [str(index), artist]
                row.extend(str(value) for value in numeric_fields_before_title)
                row.extend([title, release, str(year), song_id])
                writer.writerow(row)
                index += 1

                print("{} by {}".format(title, artist))

    print("Processed: {}".format(index))
segments_loudness_max = hdf5_getters.get_segments_loudness_max(h5) segments_loudness_max_time = hdf5_getters.get_segments_loudness_max_time(h5) segments_loudness_start = hdf5_getters.get_segments_loudness_start(h5) segments_pitches = hdf5_getters.get_segments_pitches(h5) segments_start = hdf5_getters.get_segments_start(h5) segments_timbre = hdf5_getters.get_segments_timbre(h5) similar_artists = hdf5_getters.get_similar_artists(h5) song_hotttnesss = hdf5_getters.get_song_hotttnesss(h5) song_id = hdf5_getters.get_song_id(h5) start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5) tatums_confidence = hdf5_getters.get_tatums_confidence(h5) tatums_start = hdf5_getters.get_tatums_start(h5) tempo = hdf5_getters.get_tempo(h5) time_signature = hdf5_getters.get_time_signature(h5) time_signature_confidence = hdf5_getters.get_time_signature_confidence(h5) title = hdf5_getters.get_title(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) track_id = hdf5_getters.get_track_id(h5) year = hdf5_getters.get_year(h5) h5.close() writer.writerow({ 'artist_mbid': artist_mbid, 'artist_mbtags': artist_mbtags, 'artist_name': artist_name, 'artist_playmeid': artist_playmeid, 'artist_terms': artist_terms, 'artist_terms_freq': artist_terms_freq, 'artist_terms_weight': artist_terms_weight, 'audio_md5': audio_md5, 'bars_confidence': bars_confidence,
# Read basic metadata for every song file in `songs`, printing a progress
# line each time another tenth of the list has been passed.
num_songs = len(songs)
perc_i = 0
# enumerate() supplies the position directly; songs.index(song) rescanned
# the list on every iteration (quadratic overall) and returned the first
# match for paths that occur more than once.
for song_pos, song in enumerate(songs):
    if song_pos * 10 / num_songs > perc_i:
        print(str(perc_i * 10) + "% done.")
        perc_i = perc_i + 1

    h5 = hdf5_getters.open_h5_file_read(song)

    # NOTE(review): track_id is filled from get_song_id, not get_track_id --
    # confirm that the song id is really what is wanted here.
    track_id = str(hdf5_getters.get_song_id(h5), "utf-8")
    artist = str(hdf5_getters.get_artist_name(h5), "utf-8")
    title = str(hdf5_getters.get_title(h5), "utf-8")
    loudness = float(hdf5_getters.get_loudness(h5))
    release_year = int(hdf5_getters.get_year(h5))
    tempo = float(hdf5_getters.get_tempo(h5))
    danceability = float(hdf5_getters.get_danceability(h5))

    # Tags arrive as an array of bytes; decode each one to str.
    tags = hdf5_getters.get_artist_mbtags(h5)
    tags = tags.tolist()
    tags_refined = [str(tag, "utf-8") for tag in tags]
# sanity checks
# NOTE(review): the messages below still talk about 7digital although the
# variable checked is SPOTIFY_API_KEY -- confirm which service is meant.
if SPOTIFY_API_KEY is None:
    print ('You need to set a 7digital API key!')
    print ('Get one at: http://developer.7digital.net/')
    print ('Pass it as a flag: -7digitalkey KEY')
    print ('or set it under environment variable: SPOTIFY_API_KEY')
    sys.exit(0)
if not os.path.isfile(h5path):
    print ('invalid path (not a file):',h5path)
    sys.exit(0)

# open h5 song, get all we know about the song
h5 = hdf5_utils.open_h5_file_read(h5path)
try:
    artist_name = GETTERS.get_artist_name(h5).decode('utf-8')
    track_name = GETTERS.get_title(h5).decode('utf-8')
finally:
    # Release the handle even if one of the getters raises.
    h5.close()
print('Searching for track: ', artist_name, ' - ', track_name)

# search by artist name + track title
# The search has to run before its result is inspected; previously `res`
# was compared with None before it had been assigned.
res = get_trackid_from_text_search(track_name, artistname=artist_name)
if res is None:
    print( 'Did not find track using artist name and track title')
else:
    name, preview_url = res
    print(name)
# sys.exit(0)
# closest_track,trackid = res
# if closest_track != track_name:
#     print(( 'we approximate your song title:',track_name,'by:',closest_track))
# preview = get_preview_from_trackid(trackid)
# NOTE(review): the next two statements look like the body of a loop over
# file names whose header is not visible in this fragment -- confirm the
# enclosing indentation before applying.
track_id = os.path.splitext(track_name)[0]
track_id_to_info[track_id] = None
print(len(track_id_to_info))

# Fill in metadata for every track id registered above by scanning the
# summary file once.
h5 = hdf5_utils.open_h5_file_read(h5path)
try:
    num_songs = GETTERS.get_num_songs(h5)
    print('Retrieving meta data from hdf5 file...')
    for i in tqdm(range(num_songs)):
        track_id = GETTERS.get_track_id(h5, songidx=i).decode('utf-8')
        if track_id in track_id_to_info:
            # Values are stored exactly as the getters return them (not decoded).
            track_id_to_info[track_id] = {
                'artist_name': GETTERS.get_artist_name(h5, songidx=i),
                'track_name': GETTERS.get_title(h5, songidx=i),
                'year': GETTERS.get_year(h5, songidx=i),
                'tempo': GETTERS.get_tempo(h5, songidx=i),
            }
finally:
    # Close the summary file even if the scan fails part-way.
    h5.close()

# The with-block flushes and closes the pickle file deterministically; the
# bare open() passed to pickle.dump was never closed.
with open('../track_id_to_info.pkl', 'wb') as pkl_file:
    pickle.dump(track_id_to_info, pkl_file)