def feat_from_file(path):
    """
    Extract a list of features in an array, already converted to string
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    # basic info
    feats.append(GETTERS.get_track_id(h5))
    feats.append(GETTERS.get_artist_name(h5).decode().replace(',', ''))
    feats.append(GETTERS.get_title(h5).decode().replace(',', ''))
    feats.append(GETTERS.get_loudness(h5))
    feats.append(GETTERS.get_tempo(h5))
    feats.append(GETTERS.get_time_signature(h5))
    feats.append(GETTERS.get_key(h5))
    feats.append(GETTERS.get_mode(h5))
    feats.append(GETTERS.get_duration(h5))
    # timbre
    timbre = GETTERS.get_segments_timbre(h5)
    avg_timbre = np.average(timbre, axis=0)
    for k in avg_timbre:
        feats.append(k)
    var_timbre = np.var(timbre, axis=0)
    for k in var_timbre:
        feats.append(k)
    # done with h5 file
    h5.close()
    # makes sure we return strings
    feats = map(lambda x: str(x), feats)
    return feats
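# A minimal driver sketch, not from the original source: it applies feat_from_file (above)
# across an MSD-style directory tree. The basedir layout and the features.csv output name
# are assumptions for illustration only.
import csv
import glob
import os

def dump_features_to_csv(basedir, out_path='features.csv', ext='.h5'):
    # walk the directory tree and write one row of string features per track
    with open(out_path, 'w', newline='') as out:
        writer = csv.writer(out)
        for root, dirs, files in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*' + ext)):
                writer.writerow(list(feat_from_file(f)))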
def load_non_time_data():
    years = []
    ten_features = []
    num = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                if year != 0:
                    years.append(year)
                    title_length = len(getter.get_title(h5))
                    terms_length = len(getter.get_artist_terms(h5))
                    tags_length = len(getter.get_artist_mbtags(h5))
                    hotness = getter.get_artist_hotttnesss(h5)
                    duration = getter.get_duration(h5)
                    loudness = getter.get_loudness(h5)
                    mode = getter.get_mode(h5)
                    release_length = len(getter.get_release(h5))
                    tempo = getter.get_tempo(h5)
                    name_length = len(getter.get_artist_name(h5))
                    ten_feature = np.hstack([title_length, tags_length, hotness, duration,
                                             terms_length, loudness, mode, release_length,
                                             tempo, name_length])
                    ten_features.append(ten_feature)
            except:
                print(1)
            h5.close()
    return years, ten_features
def get_key_tempo(filename):
    h5 = GETTERS.open_h5_file_read(filename)
    tempo = GETTERS.get_tempo(h5)
    key = GETTERS.get_key(h5)
    ar = GETTERS.get_artist_name(h5)
    title = GETTERS.get_title(h5)
    st = ""
    terms = None
    try:
        a = artist.Artist(str(ar))
        terms = a.get_terms()
        time.sleep(.12)
    except EchoNestIOError as e:
        print "echonestIOerror"
    except EchoNestAPIError as e:
        if e.code == 3:
            time.sleep(1)
        elif e.code == 5:
            print "code is 5"
        else:
            print "error.."
    if terms:
        print terms[0]['name']
        with open('points.csv', 'a') as fp:
            a = csv.writer(fp, delimiter=',')
            a.writerow([tempo, key, ar, title, terms[0]['name']])
    h5.close()
def insert_singer():
    print('Inserting singer tuples')
    conn = get_conn()
    cursor = get_cursor(conn)
    unique = set()
    __name = None
    __singer_id = None
    __initial_hotness = None
    try:
        # NOTE: h5 is assumed to be an already-opened aggregate HDF5 file
        # (opened elsewhere at module level) holding hard.NUM_SINGERS songs.
        for i in range(hard.NUM_SINGERS):
            __singer_id = bytes2str(GETTERS.get_artist_id(h5, i))
            if __singer_id not in unique:
                unique.add(__singer_id)
                __name = bytes2str(GETTERS.get_artist_name(h5, i))
                __initial_hotness = 0
                cursor.execute(sql.INSERT_SINGER, id=__singer_id, name=__name,
                               hotness=__initial_hotness)
        return 0
    except Exception as e:
        print(e, 'insert singer tuple error')
        print('name:', __name, 'singer_id:', __singer_id, 'hotness:', __initial_hotness)
        return -1
    finally:
        conn.commit()
        close_all(conn, cursor)
def debug_from_song_file(connect, h5path, verbose=0):
    """
    Slow debugging function that takes a h5 file, reads the info,
    check the match with musicbrainz db, prints out the result.
    Only prints when we dont get exact match!
    RETURN counts of how many files we filled for years, tags
    """
    import hdf5_utils as HDF5
    import hdf5_getters as GETTERS
    h5 = HDF5.open_h5_file_read(h5path)
    title = GETTERS.get_title(h5)
    release = GETTERS.get_release(h5)
    artist = GETTERS.get_artist_name(h5)
    ambid = GETTERS.get_artist_mbid(h5)
    h5.close()
    # mbid
    gotmbid = 1
    if ambid == '':
        gotmbid = 0
        if verbose > 0:
            print('no mb id for:', artist)
    # year
    year = find_year_safemode(connect, ambid, title, release, artist)
    gotyear = 1 if year > 0 else 0
    if gotyear == 0 and verbose > 0:
        print('no years for:', artist, '|', release, '|', title)
    # tags
    tags, counts = get_artist_tags(connect, ambid)
    gottags = 1 if len(tags) > 0 else 0
    if gottags == 0 and verbose > 0:
        print('no tags for:', artist)
    # return indicator for mbid, year, tag
    return gotmbid, gotyear, gottags
def feat_from_file(path):
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    feats.append(GETTERS.get_track_id(h5))
    feats.append(GETTERS.get_title(h5))
    feats.append(GETTERS.get_artist_name(h5))
    feats.append(GETTERS.get_year(h5))
    feats.append(GETTERS.get_loudness(h5))
    feats.append(GETTERS.get_tempo(h5))
    feats.append(GETTERS.get_time_signature(h5))
    feats.append(GETTERS.get_key(h5))
    feats.append(GETTERS.get_mode(h5))
    feats.append(GETTERS.get_duration(h5))
    # timbre
    timbre = GETTERS.get_segments_timbre(h5)
    avg_timbre = np.average(timbre, axis=0)
    for k in avg_timbre:
        feats.append(k)
    var_timbre = np.var(timbre, axis=0)
    for k in var_timbre:
        feats.append(k)
    h5.close()
    return feats
def fetch_song_from_h5(h5_filepath):
    basename, extension = os.path.splitext(h5_filepath)
    if extension != '.h5':
        return
    audio_filepath = basename + '.mp3'
    h5 = hdf5_getters.open_h5_file_read(h5_filepath)
    track_id = hdf5_getters.get_track_7digitalid(h5)
    track_name = hdf5_getters.get_title(h5)
    artist_name = hdf5_getters.get_artist_name(h5)
    h5.close()
    consumer = oauth.Consumer(OAUTH_CLIENT_KEY, OAUTH_CLIENT_SECRET)
    token = oauth.Token(OAUTH_ACCESS_TOKEN, OAUTH_ACCESS_SECRET)
    request = oauth.Request.from_consumer_and_token(
        consumer,
        http_url=get_clip_url(track_id),
        is_form_encoded=True,
        parameters={'country': 'ww'})
    signing_method = oauth.SignatureMethod_HMAC_SHA1()
    request.sign_request(signing_method, consumer, token)
    url = request.to_url()
    r = requests.get(url)
    if r.status_code not in (requests.codes.ok, requests.codes.not_found):
        print(r.status_code, r.headers, r.content)
        exit()
    if r.status_code == requests.codes.ok:
        print('FETCHED track {0} {1} {2}'.format(track_id, artist_name, track_name))
        with open(audio_filepath, 'wb') as f:
            f.write(r.content)
    else:
        print('FAILED TO FETCH track {0} {1} {2}'.format(track_id, artist_name, track_name))
def get_all_titles(basedir, ext='.h5'):
    titles = []
    artist_names = []
    terms = []
    loudness = []
    segments_loudness_max = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            titles.append(hdf5_getters.get_title(h5))
            artist_names.append(hdf5_getters.get_artist_name(h5))
            try:
                terms.append(hdf5_getters.get_artist_terms(h5))
            except:
                pass
            loudness.append(hdf5_getters.get_loudness(h5))
            try:
                segments_loudness_max.append(hdf5_getters.get_segments_loudness_max(h5))
            except:
                pass
            h5.close()
    return titles, artist_names, terms, loudness, segments_loudness_max
def create_labels(songs, sp):
    # songs holds the list of all file paths to each song as strings
    """
    Goes through all songs in list to find the danceability from spotify
    param: the list of song absolute file path names, and the Spotify object to use to make calls
    returns: a [num_songs] array of danceability labels (-1,0,1)
    """
    print("creating labels...")
    acc = 0
    labels = []
    broken_labels = []
    for i in range(1, len(songs)):
        print(i)
        file_object = hdf.open_h5_file_read(songs[i])
        artist_name = hdf.get_artist_name(file_object).decode("utf-8")
        title = re.sub(r"\(.*\)", "", hdf.get_title(file_object).decode("utf-8"))
        query = "artist: " + artist_name + " track: " + title
        label = get_danceability(query, sp)
        if label != -1:
            labels.append(label)
        else:
            broken_labels.append(i)
            acc += 1
        file_object.close()
    print("NUMBER OF LOST SONGS = ", acc)
    return np.array(labels, dtype=np.int32), np.array(broken_labels, dtype=np.int32)
def process_song(self, song_path):
    song_data = h5.open_h5_file_read(song_path)
    song_id = h5.get_song_id(song_data).decode('UTF-8')
    song_int_id = int(h5.get_track_7digitalid(song_data))
    song_name = h5.get_title(song_data).decode('UTF-8').lower()
    artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
    song_year = int(h5.get_year(song_data))
    timbre = self.ndarray_list_to_ndlist(h5.get_segments_timbre(song_data))
    chroma = self.ndarray_list_to_ndlist(h5.get_segments_pitches(song_data))
    song_data.close()
    song_dict = {
        'id': song_int_id,
        'source_id': song_id,
        'name': song_name,
        'artist': artist_name,
        'year': song_year,
        'timbre': timbre,
        'chroma': chroma
    }
    return song_dict
def get_info(basedir, ext='.h5'):
    # Create new text file for storing the result of JSON objects
    resultFile = open("result.txt", "w")
    # Going through all sub-directories under the base directory
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            # Open the HDF5 for reading the content
            h5 = hdf5_getters.open_h5_file_read(f)
            # Creating dictionary to convert to JSON object
            dictionary = {}
            # Storing all fields
            dictionary["song_title"] = hdf5_getters.get_title(h5).decode('Latin-1')
            dictionary["artist_name"] = hdf5_getters.get_artist_name(h5).decode('Latin-1')
            dictionary["key"] = float(hdf5_getters.get_key(h5))
            dictionary["minor-major"] = float(hdf5_getters.get_mode(h5))
            dictionary["hotness"] = hdf5_getters.get_song_hotttnesss(h5)
            dictionary["artist_location"] = hdf5_getters.get_artist_location(h5).decode('Latin-1')
            dictionary["longitude"] = float(hdf5_getters.get_artist_longitude(h5))
            dictionary["latitude"] = float(hdf5_getters.get_artist_latitude(h5))
            print(dictionary)
            # Write the created JSON object to the text file
            resultFile.write(str(json.dumps(dictionary)) + "\n")
            h5.close()
    resultFile.close()
def process_song(self, song_path):
    # read file
    song_data = h5.open_h5_file_read(song_path)
    # process file
    #song_id = h5.get_song_id(song_data).decode('UTF-8')
    song_int_id = int(h5.get_track_7digitalid(song_data))
    song_name = h5.get_title(song_data).decode('UTF-8').lower()
    artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
    song_year = int(h5.get_year(song_data))
    sp = SpotifyInterface()
    track_info = sp.search_track_info(artist_name, song_name)
    if track_info is None:
        song_data.close()
        return None
    timbre = self.ndarray_list_to_ndlist(h5.get_segments_timbre(song_data))
    chroma = self.ndarray_list_to_ndlist(h5.get_segments_pitches(song_data))
    song_data.close()
    song_dict = {'id': song_int_id,
                 'name': song_name,
                 'artist': artist_name,
                 'year': song_year,
                 'timbre': timbre,
                 'chroma': chroma,
                 **track_info}
    return song_dict
def traverseAndWrite(root, genreDirs, genreKeys):
    if not isfile(root):
        for f in listdir(root):
            traverseAndWrite(root + "/" + f, genreDirs, genreKeys)
    else:
        h5 = hdf5_getters.open_h5_file_read(root)
        numOfSongs = hdf5_getters.get_num_songs(h5)
        for index in range(numOfSongs):
            tags = hdf5_getters.get_artist_mbtags(h5, index)
            # print tags
            artist = hdf5_getters.get_artist_name(h5, index)
            songName = hdf5_getters.get_title(h5, index)
            segmentTimbre = hdf5_getters.get_segments_timbre(h5, index)
            segmentPitches = hdf5_getters.get_segments_pitches(h5, index)
            if notValidSong(tags, artist, songName, segmentTimbre, segmentPitches):
                h5.close()
                continue
            for genre in genreKeys:
                if genreInTags(genre, tags):
                    song = {}
                    song['genre'] = genre
                    song['artist_name'] = artist
                    song['song_title'] = songName
                    song['segments_pitches'] = segmentPitches.tolist()
                    song['segments_timbre'] = segmentTimbre.tolist()
                    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
                    songName = ''.join(c for c in songName if c in valid_chars)
                    artist = ''.join(c for c in artist if c in valid_chars)
                    fd = open(genreDirs[genre] + "/" + artist + "--" + songName + ".json", 'a')
                    writeToDescriptor(fd, song)
                    fd.close()
        h5.close()
def func_to_desired_song_data(filename):
    h5 = GETTERS.open_h5_file_read(filename)
    track_id = GETTERS.get_track_id(h5)
    for song in random_songs:
        if song[0] == track_id:
            print("FOUND ONE!")
            title = replace_characters(GETTERS.get_title(h5))
            artist = replace_characters(GETTERS.get_artist_name(h5))
            year = GETTERS.get_year(h5)
            tempo = GETTERS.get_tempo(h5)
            key = GETTERS.get_key(h5)
            loudness = GETTERS.get_loudness(h5)
            energy = GETTERS.get_energy(h5)
            danceability = GETTERS.get_danceability(h5)
            time_signature = GETTERS.get_time_signature(h5)
            mode = GETTERS.get_mode(h5)
            hotttness = GETTERS.get_song_hotttnesss(h5)
            song_data = {
                'title': title,
                'artist': artist,
                'year': year,
                'tempo': tempo,
                'key': key,
                'loudness': loudness,
                'energy': energy,
                'danceability': danceability,
                'time_signature': time_signature,
                'mode': mode,
                'hotttness': hotttness
            }
            all_the_data.append(song_data)
    h5.close()
def feat_from_file(path):
    """
    Extract a list of features in an array, already converted to string
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    # basic info
    feats.append(GETTERS.get_track_id(h5))
    feats.append(GETTERS.get_artist_name(h5).replace(',', ''))
    feats.append(GETTERS.get_title(h5).replace(',', ''))
    feats.append(GETTERS.get_loudness(h5))
    feats.append(GETTERS.get_tempo(h5))
    feats.append(GETTERS.get_time_signature(h5))
    feats.append(GETTERS.get_key(h5))
    feats.append(GETTERS.get_mode(h5))
    feats.append(GETTERS.get_duration(h5))
    # timbre
    timbre = GETTERS.get_segments_timbre(h5)
    avg_timbre = np.average(timbre, axis=0)
    for k in avg_timbre:
        feats.append(k)
    var_timbre = np.var(timbre, axis=0)
    for k in var_timbre:
        feats.append(k)
    # done with h5 file
    h5.close()
    # makes sure we return strings
    feats = map(lambda x: str(x), feats)
    return feats
def get_all_examples(basedir, genre_dict, ext='.h5'):
    """
    From a base directory, goes through all subdirectories,
    grabs all songs and their features and puts them into a pandas dataframe
    INPUT
       basedir    - base directory of the dataset
       genre_dict - a dictionary mapping track id to genre based on the tagtraum dataset
       ext        - extension, .h5 by default
    RETURN
       dataframe containing all song examples
    """
    features_vs_genre = pd.DataFrame()
    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        # apply function to all files
        for f in files:
            h5 = GETTERS.open_h5_file_read(f)
            song_id = GETTERS.get_track_id(h5).decode('utf-8')
            if song_id in genre_dict:
                genre = genre_dict[song_id]
                year = GETTERS.get_year(h5)
                duration = GETTERS.get_duration(h5)
                end_of_fade_in = GETTERS.get_end_of_fade_in(h5)
                loudness = GETTERS.get_loudness(h5)
                song_hotttnesss = GETTERS.get_song_hotttnesss(h5)
                tempo = GETTERS.get_tempo(h5)
                key = GETTERS.get_key(h5)
                key_confidence = GETTERS.get_key_confidence(h5)
                mode = GETTERS.get_mode(h5)
                mode_confidence = GETTERS.get_mode_confidence(h5)
                time_signature = GETTERS.get_time_signature(h5)
                time_signature_confidence = GETTERS.get_time_signature_confidence(h5)
                artist_name = GETTERS.get_artist_name(h5)
                title = GETTERS.get_title(h5)
                # length of sections_start array gives us the number of sections
                num_sections = len(GETTERS.get_sections_start(h5))
                num_segments = len(GETTERS.get_segments_confidence(h5))
                example = pd.DataFrame(
                    data=[(artist_name, title, song_id, genre, year, key,
                           key_confidence, mode, mode_confidence, time_signature,
                           time_signature_confidence, duration, end_of_fade_in,
                           loudness, song_hotttnesss, tempo, num_sections, num_segments)],
                    columns=['artist_name', 'title', 'song_id', 'genre', 'year', 'key',
                             'key_confidence', 'mode', 'mode_confidence', 'time_signature',
                             'time_signature_confidence', 'duration', 'end_of_fade_in',
                             'loudness', 'song_hotttnesss', 'tempo', 'num_sections',
                             'num_segments'])
                features_vs_genre = features_vs_genre.append(example)
            h5.close()
    return features_vs_genre
def get_attribute(files):
    array = []
    count = 0
    for f in files:
        temp = []
        count += 1
        print(f)
        h5 = hdf5_getters.open_h5_file_read(f)
        temp.append(hdf5_getters.get_num_songs(h5))
        temp.append(hdf5_getters.get_artist_familiarity(h5))
        temp.append(hdf5_getters.get_artist_hotttnesss(h5))
        temp.append(hdf5_getters.get_danceability(h5))
        temp.append(hdf5_getters.get_energy(h5))
        temp.append(hdf5_getters.get_key(h5))
        temp.append(hdf5_getters.get_key_confidence(h5))
        temp.append(hdf5_getters.get_loudness(h5))
        temp.append(hdf5_getters.get_mode(h5))
        temp.append(hdf5_getters.get_mode_confidence(h5))
        temp.append(hdf5_getters.get_tempo(h5))
        temp.append(hdf5_getters.get_time_signature(h5))
        temp.append(hdf5_getters.get_time_signature_confidence(h5))
        temp.append(hdf5_getters.get_title(h5))
        temp.append(hdf5_getters.get_artist_name(h5))
        temp = np.nan_to_num(temp)
        array.append(temp)
        # if count % 100 == 0:
        #     print(array[count-100:count-1])
        #     kmean.fit(array[count-100:count-1])
        h5.close()
    return array
def song_to_artist(if_str):
    songs_tracks = pickle.load(open("../../msd_dense_subset/dense/songs_tracks.pkl", 'r'))
    track = str(songs_tracks[if_str])
    # build path
    path = "../../msd_dense_subset/dense/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5"
    h5 = GETTERS.open_h5_file_read(path)
    artist = GETTERS.get_artist_name(h5)
    h5.close()
    return artist
def hdf5_to_features(file_name):
    """
    Receives path to HDF5 file, returns 2 lists of identification for the song
    as well as the features for the algorithm.

    Parameters
    ----------
    file_name : str
        Absolute path to the HDF5 file.

    Returns
    -------
    list1 : list
        List consisting of ID, song title and artist name.
    list2 : list
        34 features to represent the song.
    """
    with hdf5_getters.open_h5_file_read(file_name) as reader:
        # ID
        ID = hdf5_getters.get_song_id(reader)
        title = hdf5_getters.get_title(reader)
        artist = hdf5_getters.get_artist_name(reader)
        # Features 1-4
        beat_starts = hdf5_getters.get_beats_start(reader)
        beat_durations = np.diff(beat_starts, axis=0)
        # try:
        tempo_10, tempo_90 = np.quantile(beat_durations, [0.1, 0.9])
        # except:
        #     print(beat_durations)
        #     exit()
        temp_var = np.var(beat_durations)
        temp_mean = np.mean(beat_durations)
        # Features 5-8
        segment_loudness = hdf5_getters.get_segments_loudness_max(reader)
        loud_10, loud_90 = np.quantile(segment_loudness, [0.1, 0.9])
        loud_var = np.var(segment_loudness)
        loud_mean = np.mean(segment_loudness)
        # Features 9-21
        pitch_dominance = hdf5_getters.get_segments_pitches(reader)
        pitch_means = pitch_dominance.mean(axis=0)
        pitch_var = pitch_means.var()
        # Features 22-34
        timbre = hdf5_getters.get_segments_timbre(reader)
        timbre_means = timbre.mean(axis=0)
        timbre_var = timbre_means.var()
    return [ID, title, artist], [
        tempo_10, tempo_90, temp_var, temp_mean,
        loud_10, loud_90, loud_var, loud_mean
    ] + list(pitch_means) + [pitch_var] + list(timbre_means) + [timbre_var]
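# A small, hypothetical usage sketch (not part of the original function): it stacks the
# per-song feature lists returned by hdf5_to_features into a NumPy matrix. The name
# file_paths is assumed to be a list of .h5 paths.
import numpy as np

def build_feature_matrix(file_paths):
    # ids[i] pairs with features[i]; both come from hdf5_to_features above
    ids, features = [], []
    for path in file_paths:
        song_info, feats = hdf5_to_features(path)
        ids.append(song_info)
        features.append(feats)
    return ids, np.asarray(features, dtype=float)  # shape: (n_songs, 34)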
def main():
    # print("we in")
    outputFile1 = open('../Datasets/MSDSubsetCSV.csv', 'w')
    csvRowString = ""
    csvRowString = "Title,ArtistName"
    csvAttributeList = re.split(',', csvRowString)
    for i, v in enumerate(csvAttributeList):
        csvAttributeList[i] = csvAttributeList[i].lower()
    csvRowString += ",\n"

    basedir = '/Users/Owner/Desktop/School/2019-2020/COMP400/MillionSongSubset/'
    ext = ".h5"

    # FOR LOOP
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print(f)
            songH5File = hdf5_getters.open_h5_file_read(f)
            song = Song(str(hdf5_getters.get_song_id(songH5File)))
            song.title = str(hdf5_getters.get_title(songH5File)).replace("b'", "").lower()
            song.artistName = str(hdf5_getters.get_artist_name(songH5File)).replace("b'", "").lower()
            song.year = str(hdf5_getters.get_year(songH5File))
            if int(song.year) < 1990:
                print('nope', int(song.year))
                continue
            for attribute in csvAttributeList:
                # print "Here is the attribute: " + attribute + " \n"
                if attribute == 'ArtistName'.lower():
                    csvRowString += "\"" + song.artistName.replace("'", "") + "\""  # took out "\"" before and after
                elif attribute == 'Title'.lower():
                    csvRowString += "\"" + song.title.replace("'", "") + "\""
                else:
                    csvRowString += "Erm. This didn't work. Error. :( :(\n"
                csvRowString += ","
            # Remove the final comma from each row in the csv
            lastIndex = len(csvRowString)
            csvRowString = csvRowString[0:lastIndex - 1]
            csvRowString += "\n"
            outputFile1.write(csvRowString)
            csvRowString = ""
            songH5File.close()
    outputFile1.close()
def func_to_get_artist_name(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    artist_name = GETTERS.get_artist_name(h5)
    all_artist_names.add(artist_name)
    h5.close()
def get_track_info(track, h5=None):
    # get song and artist of the track
    close = (h5 is None)
    if h5 is None:
        path = "../../msd_dense_subset/mood/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5"
        h5 = GETTERS.open_h5_file_read(path)
    artist = GETTERS.get_artist_name(h5)
    title = GETTERS.get_title(h5)
    if close:
        h5.close()
    return str(artist) + '-' + str(title)
def better_MSD_sample_dirslist(paths):
    """
    get list of filenames, artist, song title
    for all h5 files in a list of MSD sample directories
    """
    dirdata = []
    for path in paths:
        dirlist = os.listdir(path)
        for fname in dirlist:
            with GETTERS.open_h5_file_read(path + fname) as h5:
                dirdata.append([fname, GETTERS.get_artist_name(h5), GETTERS.get_title(h5)])
    return dirdata
def func_to_get_instrumental(filename):
    h5 = GETTERS.open_h5_file_read(filename)
    tags = set(GETTERS.get_artist_mbtags(h5))
    genres = {'classical', 'orchestral'}
    if tags.intersection(genres):
        d = {}
        d['artist'] = GETTERS.get_artist_name(h5)
        d['title'] = GETTERS.get_title(h5)
        song_id = GETTERS.get_song_id(h5)
        classical[song_id] = d
    h5.close()
def get_track_info(track, h5=None):
    # get song and artist of the track
    close = (h5 is None)
    if h5 is None:
        path = "../../../msd_dense_subset/dense/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5"
        h5 = GETTERS.open_h5_file_read(path)
    artist = GETTERS.get_artist_name(h5)
    title = GETTERS.get_title(h5)
    if close:
        h5.close()
    return str(artist) + '-' + str(title)
def extract_data(filename):
    h5 = GETTERS.open_h5_file_read(filename)
    track_id = GETTERS.get_song_id(h5)
    if track_id in already:
        h5.close()
        return
    songdata[track_id].append(GETTERS.get_title(h5))
    songdata[track_id].append(GETTERS.get_artist_name(h5))
    songdata[track_id].append(GETTERS.get_duration(h5))
    h5.close()
def get_all_titles(basedir, ext='.h5'):
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            title = hdf5_getters.get_title(h5)
            title = re.sub('[^A-Za-z0-9 ]+', '', title)
            name = hdf5_getters.get_artist_name(h5)
            h5.close()
            x, _, _ = ws.billboard(title, name)
            print title, x
def MSD_sample_dirlist(path):
    """
    get list of filenames, artist, song title
    for all h5 files in an MSD sample directory
    """
    dirpath = path
    dirlist = os.listdir(path)
    dirdata = []
    for fname in dirlist:
        h5 = GETTERS.open_h5_file_read(dirpath + fname)
        dirdata.append([fname, GETTERS.get_artist_name(h5), GETTERS.get_title(h5)])
        h5.close()
    return dirdata
def getInfo(files):
    infoList = np.array(['tid', 'artist', 'song'])
    for fil in files:
        curFile = getter.open_h5_file_read(fil)
        tid = fil.split('/')[-1].split('.')[0]
        curArtist = getter.get_artist_name(curFile)
        curTitle = getter.get_title(curFile)
        curArr = np.array([tid, curArtist, curTitle])
        infoList = np.vstack([infoList, curArr])
        curFile.close()
    return infoList
def songinfo(if_str):
    songs_tracks = pickle.load(open("../../msd_dense_subset/dense/songs_tracks.pkl", 'r'))
    track = str(songs_tracks[if_str])
    # build path
    path = "../../msd_dense_subset/dense/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5"
    h5 = GETTERS.open_h5_file_read(path)
    artist_name = GETTERS.get_artist_name(h5)
    song_name = GETTERS.get_title(h5)
    year = GETTERS.get_year(h5, 0)
    #segments = GETTERS.get_segments_start(h5, 0)
    #segments_pitches = GETTERS.get_segments_pitches(h5, 0)
    h5.close()
    return artist_name + " - " + song_name + " (" + str(year) + ")"
def _extractSongData(file_path, filename):
    # song_id, title, release, artist_name, year
    h5 = hdf5_getters.open_h5_file_read(file_path)
    track_id = filename[:-3]
    song_id = hdf5_getters.get_song_id(h5).decode('UTF-8')
    dig7_id = hdf5_getters.get_track_7digitalid(h5)
    title = hdf5_getters.get_title(h5).decode('UTF-8')
    release = hdf5_getters.get_release(h5).decode('UTF-8')
    artist_name = hdf5_getters.get_artist_name(h5).decode('UTF-8')
    year = hdf5_getters.get_year(h5)
    h5.close()
    # print(song_id, track_id, dig7_id, title, release, artist_name, year)
    return track_id, song_id, dig7_id, title, release, artist_name, year
def getArtistNameAndSongName(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get the artist name, song title and song ID and store them
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    artist_name = GETTERS.get_artist_name(h5)
    song_name = GETTERS.get_title(h5)
    song_id = GETTERS.get_song_id(h5)
    songsAndArtists[str(song_id)] = tuple((artist_name, song_name))
    h5.close()
def func_to_get_desired_values(filename, returnValue=False):
    """
    This function does 3 simple things:
    - open the song file
    - get the elements we want and put them in
    - close the file
    INPUT : filename - The name of the h5 file to be loaded
    """
    global all_desired_data
    # Open file
    h5 = GETTERS.open_h5_file_read(filename)
    # Create and fill a record
    record = []
    for element in elementsRequested:
        result = getattr(GETTERS, element)(h5)
        try:
            if result == '':
                result = 'Adlen - void'
        except:
            pass
        try:
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    result = float(np.mean(result))
                else:
                    result = ''
        except:
            try:
                result = float(result)
            except:
                pass
        record.append(result)
    song_id = GETTERS.get_track_id(h5)
    artist_name = GETTERS.get_artist_name(h5)
    title = GETTERS.get_title(h5)
    artist_mbtags = GETTERS.get_artist_mbtags(h5)
    release = GETTERS.get_release(h5)
    song_id = unicode(song_id.decode('utf-8'))
    title = unicode(title.decode('utf-8'))
    artist_name = unicode(artist_name.decode('utf-8'))
    if not returnValue:
        all_desired_data.append([[[song_id, title, artist_name, elementsRequested],
                                  artist_name, title, artist_mbtags, release], record])
    h5.close()
    if returnValue:
        return [[[song_id, title, artist_name, elementsRequested],
                 artist_name, title, artist_mbtags, release], record]
def MSD_sample_dirlist_save(path, file_path):
    """
    get list of filenames, artist, song title
    for all h5 files in an MSD sample directory
    and save to csv
    """
    import csv
    dirpath = path
    dirlist = os.listdir(path)
    dirdata = []
    for fname in dirlist:
        h5 = GETTERS.open_h5_file_read(dirpath + fname)
        dirdata.append([dirpath, fname, GETTERS.get_artist_name(h5), GETTERS.get_title(h5)])
        h5.close()
    listwriter = csv.writer(open(file_path, 'a'), delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    listwriter.writerows(dirdata)
    return dirdata
def dump_pitches(rootdir='./data/', filename='pitches.p'):
    pitches = []
    tags = []
    for subdir, dirs, files in os.walk(rootdir):
        for f in files:
            if f.lower().endswith('.h5'):
                h5f = hdf5_getters.open_h5_file_read(os.path.join(subdir, f))
                seg_ptcs = hdf5_getters.get_segments_pitches(h5f)
                if 500 <= seg_ptcs.shape[0] <= 1500:
                    pitches.append(seg_ptcs)
                    tags.append('%s - %s - %s - %s' % (hdf5_getters.get_artist_name(h5f),
                                                       hdf5_getters.get_title(h5f),
                                                       hdf5_getters.get_year(h5f),
                                                       hdf5_getters.get_tempo(h5f)))
                h5f.close()
    pickle.dump(pitches, open(filename, 'wb'))
    pickle.dump(tags, open('tags.p', 'wb'))
    print 'Saved {} pitches.'.format(len(pitches))
def map_artists_for_users():
    users_artists = dict()
    songs_tracks = pickle.load(open("../msd_dense_subset/dense/songs_tracks.pkl", 'r'))
    for user in users_songs:
        print user
        users_artists[user] = set()
        for song in users_songs[user]:
            track = str(songs_tracks[song])
            # build path
            path = "../msd_dense_subset/dense/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5"
            h5 = GETTERS.open_h5_file_read(path)
            users_artists[user].add(GETTERS.get_artist_name(h5))
            h5.close()
    # store in pickle file for the moment
    with open(USERS_ARTIST_FILE, 'w') as f:
        pickle.dump(users_artists, f, pickle.HIGHEST_PROTOCOL)
    print "data saved to %s" % USERS_ARTIST_FILE
def get_url(h5_file):
    artist_name = GETTERS.get_artist_name(h5_file)
    track_name = GETTERS.get_title(h5_file)
    echo_nest_id = GETTERS.get_track_id(h5_file).lower()
    if echo_nest_id >= 0:
        preview = get_preview_from_trackid(echo_nest_id)
        if preview != '':
            return preview
    res = get_trackid_from_text_search(track_name, artistname=artist_name)
    if len(res) > 0:
        closest_track = get_closest_track(res, track_name)
        preview = get_preview_from_trackid(closest_track['id'])
        return preview
    return None
def process_song(h5):
    song = {}
    song['artist_familiarity'] = hdf5_getters.get_artist_familiarity(h5)
    song['artist_id'] = hdf5_getters.get_artist_id(h5)
    song['artist_name'] = hdf5_getters.get_artist_name(h5)
    song['artist_hotttnesss'] = hdf5_getters.get_artist_hotttnesss(h5)
    song['title'] = hdf5_getters.get_title(h5)
    terms = hdf5_getters.get_artist_terms(h5)
    terms_freq = hdf5_getters.get_artist_terms_freq(h5)
    terms_weight = hdf5_getters.get_artist_terms_weight(h5)
    terms_array = []
    # Creating an array of [term, its frequency, its weight]. Doing this for all terms
    # associated with the artist
    for i in range(len(terms)):
        terms_array.append([terms[i], terms_freq[i], terms_weight[i]])
    song['artist_terms'] = terms_array
    beats_start = hdf5_getters.get_beats_start(h5)
    song['beats_start_variance'] = variance(beats_start)  # beats variance in yocto seconds (10^-24 s)
    song['number_of_beats'] = len(beats_start)
    song['duration'] = hdf5_getters.get_duration(h5)
    song['loudness'] = hdf5_getters.get_loudness(h5)
    sections_start = hdf5_getters.get_sections_start(h5)
    song['sections_start_variance'] = variance(sections_start)
    song['number_of_sections'] = len(sections_start)
    segments_pitches = hdf5_getters.get_segments_pitches(h5)
    (a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_pitches)
    song['segments_pitches_variance'] = [variance(a0), variance(a1), variance(a2), variance(a3),
                                         variance(a4), variance(a5), variance(a6), variance(a7),
                                         variance(a8), variance(a9), variance(a10), variance(a11)]
    song['segments_pitches_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), mean(a5),
                                     mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)]
    segments_timbre = hdf5_getters.get_segments_timbre(h5)
    (a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_timbre)
    song['segments_timbre_variance'] = [variance(a0), variance(a1), variance(a2), variance(a3),
                                        variance(a4), variance(a5), variance(a6), variance(a7),
                                        variance(a8), variance(a9), variance(a10), variance(a11)]
    song['segments_timbre_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), mean(a5),
                                    mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)]
    song['tempo'] = hdf5_getters.get_tempo(h5)
    song['_id'] = hdf5_getters.get_song_id(h5)
    song['year'] = hdf5_getters.get_year(h5)
    return song
def load_raw_data():
    years = []
    ten_features = []
    timbres = []
    pitches = []
    min_length = 10000
    num = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                if year != 0:
                    timbre = getter.get_segments_timbre(h5)
                    s = np.size(timbre, 0)
                    if s >= 100:
                        if s < min_length:
                            min_length = s
                        pitch = getter.get_segments_pitches(h5)
                        years.append(year)
                        timbres.append(timbre)
                        pitches.append(pitch)
                        title_length = len(getter.get_title(h5))
                        terms_length = len(getter.get_artist_terms(h5))
                        tags_length = len(getter.get_artist_mbtags(h5))
                        hotness = getter.get_artist_hotttnesss(h5)
                        duration = getter.get_duration(h5)
                        loudness = getter.get_loudness(h5)
                        mode = getter.get_mode(h5)
                        release_length = len(getter.get_release(h5))
                        tempo = getter.get_tempo(h5)
                        name_length = len(getter.get_artist_name(h5))
                        ten_feature = np.hstack([title_length, hotness, duration, tags_length,
                                                 terms_length, loudness, mode, release_length,
                                                 tempo, name_length])
                        ten_features.append(ten_feature)
            except:
                print(1)
            h5.close()
    return years, timbres, pitches, min_length, ten_features
def getInfo(files, genres, songs, topicNum):
    # Checks to see if a db song is in our subset, then adds it
    # Not the most efficient method
    infoList = np.zeros(topicNum + 4)
    for fil in files:
        for song in songs:
            if fil.split('/')[-1].split('.')[0] == song[1].split('/')[-1].split('.')[0]:
                curFile = getter.open_h5_file_read(fil)
                tid = fil.split('/')[-1].split('.')[0]
                curArtist = getter.get_artist_name(curFile)
                curTitle = getter.get_title(curFile)
                curArr = np.array([tid, curArtist, curTitle])
                infoList = np.vstack([infoList, np.hstack([curArr, genres[tid], song[2:]])])
                curFile.close()
    return infoList[1:]
def h5_to_csv_fields(h5, song):
    '''Converts h5 format to text
    Inputs: h5, an h5 file object, usable with the wrapper code MSongsDB
            song, an integer, representing which song in the h5 file to take the info out of
            (h5 files contain many songs)
    Output: a string representing all the information of this song,
            as a single line of a csv file
    '''
    rv = []
    ## All these are regular getter functions from wrapper code
    rv.append(gt.get_artist_name(h5, song))
    rv.append(gt.get_title(h5, song))
    rv.append(gt.get_release(h5, song))
    rv.append(gt.get_year(h5, song))
    rv.append(gt.get_duration(h5, song))
    rv.append(gt.get_artist_familiarity(h5, song))
    rv.append(gt.get_artist_hotttnesss(h5, song))
    rv.append(gt.get_song_hotttnesss(h5, song))
    ## artist_terms, artist_terms_freq, and artist_terms_weight getter functions
    ## are all arrays, so we need to turn them into strings first. We used '_' as a separator
    rv.append(array_to_csv_field(list(gt.get_artist_terms(h5, song))))
    rv.append(array_to_csv_field(list(gt.get_artist_terms_freq(h5, song))))
    rv.append(array_to_csv_field(list(gt.get_artist_terms_weight(h5, song))))
    rv.append(gt.get_mode(h5, song))
    rv.append(gt.get_key(h5, song))
    rv.append(gt.get_tempo(h5, song))
    rv.append(gt.get_loudness(h5, song))
    rv.append(gt.get_danceability(h5, song))
    rv.append(gt.get_energy(h5, song))
    rv.append(gt.get_time_signature(h5, song))
    rv.append(array_to_csv_field(list(gt.get_segments_start(h5, song))))
    ## These arrays have vectors (arrays) as items, 12 dimensional each
    ## An array like [[1,2,3],[4,5,6]] will be written to csv as '1;2;3_4;5;6',
    ## i.e. there's two types of separators
    rv.append(double_Array_to_csv_field(list(gt.get_segments_timbre(h5, song)), '_', ';'))
    rv.append(double_Array_to_csv_field(list(gt.get_segments_pitches(h5, song)), '_', ';'))
    rv.append(array_to_csv_field(list(gt.get_segments_loudness_start(h5, song))))
    rv.append(array_to_csv_field(list(gt.get_segments_loudness_max(h5, song))))
    rv.append(array_to_csv_field(list(gt.get_segments_loudness_max_time(h5, song))))
    rv.append(array_to_csv_field(list(gt.get_sections_start(h5, song))))
    ## turn this list into a string with comma separators (i.e. a csv line)
    rv_string = array_to_csv_field(rv, ",")
    rv_string += "\n"
    return rv_string
def getTrackInfo(starting_num):
    my_list = []
    f = hdf5_getters.open_h5_file_read(filepath)
    progress_bar = tqdm(range(tracks_per_thread))
    for iteration in progress_bar:
        i = int(iteration) + (starting_num * tracks_per_thread)
        track_id = hdf5_getters.get_track_id(f, i).decode()
        if track_id not in lyric_track_ids_set:
            continue  # skip it and go on
        artist_name = hdf5_getters.get_artist_name(f, i).decode()
        duration = hdf5_getters.get_duration(f, i)
        loudness = hdf5_getters.get_loudness(f, i)
        tempo = hdf5_getters.get_tempo(f, i)
        title = hdf5_getters.get_title(f, i).decode()
        year = hdf5_getters.get_year(f, i)
        long_list = [track_id, artist_name, duration, loudness, tempo, title, year]
        my_list.append(long_list)
        progress_bar.set_description("Iteration %d" % i)
    f.close()
    return my_list
def get_all_data(target, basedir, ext='.h5'):
    # header
    target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
        "track_id", "song_id", "title", "artist_name", "artist_location",
        "artist_hotttnesss", "release", "year", "song_hotttnesss", "danceability",
        "duration", "loudness", "sample_rate", "tempo"))
    count = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            # f is already the path to an .h5 file, so open it directly
            h5 = hdf5_getters.open_h5_file_read(f)
            target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                hdf5_getters.get_track_id(h5),
                hdf5_getters.get_song_id(h5),
                hdf5_getters.get_title(h5),
                hdf5_getters.get_artist_name(h5),
                hdf5_getters.get_artist_location(h5),
                hdf5_getters.get_artist_hotttnesss(h5),
                hdf5_getters.get_release(h5),
                hdf5_getters.get_year(h5),
                hdf5_getters.get_song_hotttnesss(h5),
                hdf5_getters.get_danceability(h5),
                hdf5_getters.get_duration(h5),
                hdf5_getters.get_loudness(h5),
                hdf5_getters.get_analysis_sample_rate(h5),
                hdf5_getters.get_tempo(h5)))
            # show progress
            count += 1
            print "%d/10000" % (count)
            h5.close()
def get_all_attributes(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get all required attributes
    - write it to a csv file
    - close the files
    """
    with open('attributes.csv', 'a') as csvfile:
        try:
            # let's apply the previous function to all files
            csvwriter = csv.writer(csvfile, delimiter='\t')
            h5 = GETTERS.open_h5_file_read(filename)
            RESULTS = []
            RESULTS.append(GETTERS.get_year(h5))
            RESULTS.append(GETTERS.get_artist_id(h5))
            RESULTS.append(GETTERS.get_artist_name(h5))
            RESULTS.append(GETTERS.get_artist_mbid(h5))
            RESULTS.append(convert_terms(GETTERS.get_artist_terms(h5)))
            RESULTS.append(GETTERS.get_artist_hotttnesss(h5))
            RESULTS.append(GETTERS.get_artist_latitude(h5))
            RESULTS.append(GETTERS.get_artist_longitude(h5))
            RESULTS.append(GETTERS.get_artist_familiarity(h5))
            RESULTS.append(GETTERS.get_danceability(h5))
            RESULTS.append(GETTERS.get_duration(h5))
            RESULTS.append(GETTERS.get_energy(h5))
            RESULTS.append(GETTERS.get_loudness(h5))
            RESULTS.append(GETTERS.get_song_hotttnesss(h5))
            RESULTS.append(GETTERS.get_song_id(h5))
            RESULTS.append(GETTERS.get_tempo(h5))
            RESULTS.append(GETTERS.get_time_signature(h5))
            RESULTS.append(GETTERS.get_title(h5))
            RESULTS.append(GETTERS.get_track_id(h5))
            RESULTS.append(GETTERS.get_release(h5))
            csvwriter.writerow(RESULTS)
            h5.close()
        except AttributeError:
            pass
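# A hedged driver sketch (not from the original snippet) that could feed get_all_attributes
# every .h5 file under a dataset root; the 'MillionSongSubset/data' path in the example call
# is an assumption.
import glob
import os

def apply_to_all_files(basedir, func, ext='.h5'):
    # walk the dataset tree and call func on each track file; returns the number of files seen
    count = 0
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            func(f)
            count += 1
    return count

# example: apply_to_all_files('MillionSongSubset/data', get_all_attributes)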
def getURLFromH5(h5path):
    if not os.path.isfile(h5path):
        print 'invalid path (not a file):', h5path
        sys.exit(0)
    h5 = hdf5_utils.open_h5_file_read(h5path)
    track_7digitalid = GETTERS.get_track_7digitalid(h5)
    release_7digitalid = GETTERS.get_release_7digitalid(h5)
    artist_7digitalid = GETTERS.get_artist_7digitalid(h5)
    artist_name = GETTERS.get_artist_name(h5)
    release_name = GETTERS.get_release(h5)
    track_name = GETTERS.get_title(h5)
    h5.close()
    # we already have the 7digital track id? way too easy!
    print "Suggested Song URLs For you"
    print "==========================="
    if track_7digitalid >= 0:
        preview = get_preview_from_trackid(track_7digitalid)
        if preview == '':
            print 'something went wrong when looking by track id'
        else:
            print preview
            return preview
        sys.exit(0)
def func_to_extract_features(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    global cntnan
    global cntdanceability
    global listfeatures
    global listhotness
    global listyear
    global listloudness
    global listkey
    global listmode
    global listduration
    cf = []
    h5 = GETTERS.open_h5_file_read(filename)
    nanfound = 0
    # Get target feature: song hotness
    # FEATURE 0
    song_hotness = GETTERS.get_song_hotttnesss(h5)
    if math.isnan(song_hotness):
        nanfound = 1
        cntnan = cntnan + 1
    else:
        cf.append(song_hotness)
    # FEATURE 1
    # Get song loudness
    song_loudness = GETTERS.get_loudness(h5)
    if math.isnan(song_loudness):
        nanfound = 1
        cntnan = cntnan + 1
    else:
        cf.append(song_loudness)
    # FEATURE 2
    # Get key of the song
    song_key = GETTERS.get_key(h5)
    if math.isnan(song_key):
        nanfound = 1
        cntnan = cntnan + 1
    else:
        cf.append(song_key)
    # FEATURE 3
    # Get duration of the song
    song_duration = GETTERS.get_duration(h5)
    if math.isnan(song_duration):
        nanfound = 1
        cntnan = cntnan + 1
    else:
        cf.append(song_duration)
    # FEATURES 4-15
    # Get Average Pitch Class across all segments
    # Get the pitches (12 pitches histogram for each segment)
    pitches = GETTERS.get_segments_pitches(h5)
    M = np.mat(pitches)
    meanpitches = M.mean(axis=0)
    pitches_arr = np.asarray(meanpitches)
    pitches_list = []
    for i in range(0, 12):
        pitches_list.append(pitches_arr[0][i])
    cf.append(pitches_list)
    # FEATURES 16-27
    # Get Average Timbre Class across all segments
    timbres = GETTERS.get_segments_timbre(h5)
    M = np.mat(timbres)
    meantimbres = M.mean(axis=0)
    timbre_arr = np.asarray(meantimbres)
    timbre_list = []
    for i in range(0, 12):
        timbre_list.append(timbre_arr[0][i])
    cf.append(timbre_list)
    # FEATURE 28
    # Get song year
    song_year = GETTERS.get_year(h5)
    if song_year == 0:
        nanfound = 1
        cntnan = cntnan + 1
    else:
        cf.append(song_year)
    # FEATURE 29
    # Get song tempo
    song_tempo = GETTERS.get_tempo(h5)
    cf.append(song_tempo)
    # FEATURE 30
    # Get max loudness for each segment
    max_loudness_arr = GETTERS.get_segments_loudness_max(h5)
    start_loudness_arr = GETTERS.get_segments_loudness_start(h5)
    if nanfound == 0:
        cf.append(max(max_loudness_arr) - min(start_loudness_arr))
    # FEATURE 31
    artist_familiarity = GETTERS.get_artist_familiarity(h5)
    cf.append(artist_familiarity)
    # FEATURE 32
    song_title = GETTERS.get_title(h5)
    cf.append(song_title)
    # FEATURE 33
    artist_name = GETTERS.get_artist_name(h5)
    cf.append(artist_name)
    # FEATURE 34
    #location = GETTERS.get_artist_location(h5)
    #cf.append(location)
    # Tags
    artist_mbtags = GETTERS.get_artist_mbtags(h5)
    if not artist_mbtags.size:
        genre = "Unknown"
    else:
        artist_mbcount = np.array(GETTERS.get_artist_mbtags_count(h5))
        index_max = artist_mbcount.argmax(axis=0)
        genre = artist_mbtags[index_max]
        if genre == 'espa\xc3\xb1ol':
            genre = "Unknown"
    cf.append(genre)
    if nanfound == 0:
        strlist = list_to_csv(cf)
        listfeatures.append(strlist)
        mydict.setdefault(artist_name, []).append(song_hotness)
    h5.close()
db = MySQLdb.connect(host="localhost",user="******",passwd="password",db="FinalProject") db.query("DELETE FROM artist WHERE artist_id = 'a';") cursor = db.cursor(MySQLdb.cursors.DictCursor) counter = 0 for subdir, dirs, files in os.walk("data/"): for file in files: f = os.path.join(subdir, file) if ".h5" in f: h5 = h.open_h5_file_read(f) print ("----------") ''' Store artist tuples ''' artist_id = h.get_artist_id(h5,0) artist_name = h.get_artist_name(h5,0) artist_name = artist_name.replace("'","") artist_hottness = str(h.get_artist_hotttnesss(h5,0)) print artist_hottness if artist_hottness == "nan": artist_hottness = "0.0" artist_familiarity = str(h.get_artist_familiarity(h5,0)) if artist_familiarity == "nan": artist_familiarity = "0.0" cursor.execute("SELECT * FROM artist WHERE artist_id = '" + artist_id + "'") rs = cursor.fetchall() if cursor.rowcount != 1: cursor.execute("INSERT INTO artist VALUES ('" + artist_id + "','" + artist_name + "'," + artist_hottness + "," + artist_familiarity + ");") ''' Store artist_genres tuples ''' terms = h.get_artist_terms(h5,0)
def classify(h5):
    output_array = {}
    # duration
    duration = hdf5_getters.get_duration(h5)
    output_array["duration"] = duration
    # number of bars
    bars = hdf5_getters.get_bars_start(h5)
    num_bars = len(bars)
    output_array["num_bars"] = num_bars
    # mean and variance in bar length
    bar_length = numpy.ediff1d(bars)
    variance_bar_length = numpy.var(bar_length)
    output_array["variance_bar_length"] = variance_bar_length
    # number of beats
    beats = hdf5_getters.get_beats_start(h5)
    num_beats = len(beats)
    output_array["num_beats"] = num_beats
    # mean and variance in beat length
    beats_length = numpy.ediff1d(beats)
    variance_beats_length = numpy.var(beats_length)
    output_array["variance_beats_length"] = variance_beats_length
    # danceability
    danceability = hdf5_getters.get_danceability(h5)
    output_array["danceability"] = danceability
    # end of fade in
    end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5)
    output_array["end_of_fade_in"] = end_of_fade_in
    # energy
    energy = hdf5_getters.get_energy(h5)
    output_array["energy"] = energy
    # key
    key = hdf5_getters.get_key(h5)
    output_array["key"] = int(key)
    # loudness
    loudness = hdf5_getters.get_loudness(h5)
    output_array["loudness"] = loudness
    # mode
    mode = hdf5_getters.get_mode(h5)
    output_array["mode"] = int(mode)
    # number of sections
    sections = hdf5_getters.get_sections_start(h5)
    num_sections = len(sections)
    output_array["num_sections"] = num_sections
    # mean and variance in section length
    sections_length = numpy.ediff1d(sections)
    variance_sections_length = numpy.var(sections_length)
    output_array["variance_sections_length"] = variance_sections_length
    # number of segments
    segments = hdf5_getters.get_segments_start(h5)
    num_segments = len(segments)
    output_array["num_segments"] = num_segments
    # mean and variance in segment length
    segments_length = numpy.ediff1d(segments)
    variance_segments_length = numpy.var(segments_length)
    output_array["variance_segments_length"] = variance_segments_length
    # segment loudness max
    segment_loudness_max_array = hdf5_getters.get_segments_loudness_max(h5)
    segment_loudness_max_time_array = hdf5_getters.get_segments_loudness_max_time(h5)
    segment_loudness_max_index = 0
    for i in range(len(segment_loudness_max_array)):
        if segment_loudness_max_array[i] > segment_loudness_max_array[segment_loudness_max_index]:
            segment_loudness_max_index = i
    segment_loudness_max = segment_loudness_max_array[segment_loudness_max_index]
    segment_loudness_max_time = segment_loudness_max_time_array[segment_loudness_max_index]
    output_array["segment_loudness_max"] = segment_loudness_max
    output_array["segment_loudness_time"] = segment_loudness_max_time
    # POSSIBLE TODO: use average function instead and weight by segment length
    # segment loudness mean (start)
    segment_loudness_array = hdf5_getters.get_segments_loudness_start(h5)
    segment_loudness_mean = numpy.mean(segment_loudness_array)
    output_array["segment_loudness_mean"] = segment_loudness_mean
    # segment loudness variance (start)
    segment_loudness_variance = numpy.var(segment_loudness_array)
    output_array["segment_loudness_variance"] = segment_loudness_variance
    # segment pitches
    segment_pitches_array = hdf5_getters.get_segments_pitches(h5)
    segment_pitches_mean = numpy.mean(segment_pitches_array, axis=0).tolist()
    output_array["segment_pitches_mean"] = segment_pitches_mean
    # segment pitches variance
    segment_pitches_variance = numpy.var(segment_pitches_array, axis=0).tolist()
    output_array["segment_pitches_variance"] = segment_pitches_variance
    # segment timbres
    segment_timbres_array = hdf5_getters.get_segments_timbre(h5)
    segment_timbres_mean = numpy.mean(segment_timbres_array, axis=0).tolist()
    output_array["segment_timbres_mean"] = segment_timbres_mean
    # segment timbres variance
    segment_timbres_variance = numpy.var(segment_timbres_array, axis=0).tolist()
    output_array["segment_timbres_variance"] = segment_timbres_variance
    # hotttnesss
    hottness = hdf5_getters.get_song_hotttnesss(h5, 0)
    output_array["hottness"] = hottness
    # duration - start of fade out
    start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5)
    fade_out = duration - start_of_fade_out
    output_array["fade_out"] = fade_out
    # tatums
    tatums = hdf5_getters.get_tatums_start(h5)
    num_tatums = len(tatums)
    output_array["num_tatums"] = num_tatums
    # mean and variance in tatum length
    tatums_length = numpy.ediff1d(tatums)
    variance_tatums_length = numpy.var(tatums_length)
    output_array["variance_tatums_length"] = variance_tatums_length
    # tempo
    tempo = hdf5_getters.get_tempo(h5)
    output_array["tempo"] = tempo
    # time signature
    time_signature = hdf5_getters.get_time_signature(h5)
    output_array["time_signature"] = int(time_signature)
    # year
    year = hdf5_getters.get_year(h5)
    output_array["year"] = int(year)
    # artist terms
    artist_terms = hdf5_getters.get_artist_terms(h5, 0)
    output_array["artist_terms"] = artist_terms.tolist()
    artist_terms_freq = hdf5_getters.get_artist_terms_freq(h5, 0)
    output_array["artist_terms_freq"] = artist_terms_freq.tolist()
    artist_name = hdf5_getters.get_artist_name(h5, 0)
    output_array["artist_name"] = artist_name
    artist_id = hdf5_getters.get_artist_id(h5, 0)
    output_array["artist_id"] = artist_id
    # title
    title = hdf5_getters.get_title(h5, 0)
    output_array["title"] = title
    return output_array
        # print
        track = {}
        # Handle each one
        year = h5get.get_year(h5, i)
        if year < 1980 or year > 2010:
            continue
        song = Song()
        #song.year = year
        #song.hotness = h5get.get_song_hotttnesss(h5, i)
        #print "Hotness: ", song.hotness
        #if math.isnan(song.hotness):
        #    song.hotness = 0.0
        song.artist = h5get.get_artist_name(h5, i)
        song.name = h5get.get_title(h5, i)
        #track['track'] = str(song.artist) + " " + str(song.name)
        #track['hotness'] = float(song.hotness)
        track['artist'] = song.artist
        track['name'] = song.name
        song_list.append(track)
        #song.pop_score = calc_poffpop(song)
        #print "Poff Score", song.pop_score
        #all_songs.append(song)
    #print all_songs
    json.dump(song_list, w)
    w.close()
def data_to_flat_file(basedir, ext='.h5'):
    """This function extracts the information from the tables and creates the flat file."""
    count = 0  # song counter
    list_to_write = []
    row_to_write = ""
    writer = csv.writer(open("metadata_wholeA.csv", "wb"))
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print f  # the name of the file
            h5 = hdf5_getters.open_h5_file_read(f)
            title = hdf5_getters.get_title(h5)
            title = title.replace('"', '')
            comma = title.find(',')  # eliminating commas in the title
            if comma != -1:
                print title
                time.sleep(1)
            album = hdf5_getters.get_release(h5)
            album = album.replace('"', '')  # eliminating commas in the album
            comma = album.find(',')
            if comma != -1:
                print album
                time.sleep(1)
            artist_name = hdf5_getters.get_artist_name(h5)
            comma = artist_name.find(',')
            if comma != -1:
                print artist_name
                time.sleep(1)
            artist_name = artist_name.replace('"', '')  # eliminating double quotes
            duration = hdf5_getters.get_duration(h5)
            samp_rt = hdf5_getters.get_analysis_sample_rate(h5)
            artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5)
            artist_fam = hdf5_getters.get_artist_familiarity(h5)
            # checking if we get a "nan"; if we do we change it to -1
            if numpy.isnan(artist_fam) == True:
                artist_fam = -1
            artist_hotness = hdf5_getters.get_artist_hotttnesss(h5)
            if numpy.isnan(artist_hotness) == True:
                artist_hotness = -1
            artist_id = hdf5_getters.get_artist_id(h5)
            artist_lat = hdf5_getters.get_artist_latitude(h5)
            if numpy.isnan(artist_lat) == True:
                artist_lat = -1
            artist_loc = hdf5_getters.get_artist_location(h5)
            # checks artist_loc to see if it is a hyperlink; if it is, set as empty string
            artist_loc = artist_loc.replace(",", "\,")
            if artist_loc.startswith("<a"):
                artist_loc = ""
            if len(artist_loc) > 100:
                artist_loc = ""
            artist_lon = hdf5_getters.get_artist_longitude(h5)
            if numpy.isnan(artist_lon) == True:
                artist_lon = -1
            artist_mbid = hdf5_getters.get_artist_mbid(h5)
            artist_pmid = hdf5_getters.get_artist_playmeid(h5)
            audio_md5 = hdf5_getters.get_audio_md5(h5)
            danceability = hdf5_getters.get_danceability(h5)
            if numpy.isnan(danceability) == True:
                danceability = -1
            end_fade_in = hdf5_getters.get_end_of_fade_in(h5)
            if numpy.isnan(end_fade_in) == True:
                end_fade_in = -1
            energy = hdf5_getters.get_energy(h5)
            if numpy.isnan(energy) == True:
                energy = -1
            song_key = hdf5_getters.get_key(h5)
            key_c = hdf5_getters.get_key_confidence(h5)
            if numpy.isnan(key_c) == True:
                key_c = -1
            loudness = hdf5_getters.get_loudness(h5)
            if numpy.isnan(loudness) == True:
                loudness = -1
            mode = hdf5_getters.get_mode(h5)
            mode_conf = hdf5_getters.get_mode_confidence(h5)
            if numpy.isnan(mode_conf) == True:
                mode_conf = -1
            release_7digitalid = hdf5_getters.get_release_7digitalid(h5)
            song_hot = hdf5_getters.get_song_hotttnesss(h5)
            if numpy.isnan(song_hot) == True:
                song_hot = -1
            song_id = hdf5_getters.get_song_id(h5)
            start_fade_out = hdf5_getters.get_start_of_fade_out(h5)
            tempo = hdf5_getters.get_tempo(h5)
            if numpy.isnan(tempo) == True:
                tempo = -1
            time_sig = hdf5_getters.get_time_signature(h5)
            time_sig_c = hdf5_getters.get_time_signature_confidence(h5)
            if numpy.isnan(time_sig_c) == True:
                time_sig_c = -1
            track_id = hdf5_getters.get_track_id(h5)
            track_7digitalid = hdf5_getters.get_track_7digitalid(h5)
            year = hdf5_getters.get_year(h5)
            bars_c = hdf5_getters.get_bars_confidence(h5)
            bars_c_avg = get_avg(bars_c)
            bars_c_max = get_max(bars_c)
            bars_c_min = get_min(bars_c)
            bars_c_stddev = get_stddev(bars_c)
            bars_c_count = get_count(bars_c)
            bars_c_sum = get_sum(bars_c)
            bars_start = hdf5_getters.get_bars_start(h5)
            bars_start_avg = get_avg(bars_start)
            bars_start_max = get_max(bars_start)
            bars_start_min = get_min(bars_start)
            bars_start_stddev = get_stddev(bars_start)
            bars_start_count = get_count(bars_start)
            bars_start_sum = get_sum(bars_start)
            beats_c = hdf5_getters.get_beats_confidence(h5)
            beats_c_avg = get_avg(beats_c)
            beats_c_max = get_max(beats_c)
            beats_c_min = get_min(beats_c)
            beats_c_stddev = get_stddev(beats_c)
            beats_c_count = get_count(beats_c)
            beats_c_sum = get_sum(beats_c)
            beats_start = hdf5_getters.get_beats_start(h5)
            beats_start_avg = get_avg(beats_start)
            beats_start_max = get_max(beats_start)
            beats_start_min = get_min(beats_start)
            beats_start_stddev = get_stddev(beats_start)
            beats_start_count = get_count(beats_start)
            beats_start_sum = get_sum(beats_start)
            sec_c = hdf5_getters.get_sections_confidence(h5)
            sec_c_avg = get_avg(sec_c)
            sec_c_max = get_max(sec_c)
            sec_c_min = get_min(sec_c)
            sec_c_stddev = get_stddev(sec_c)
            sec_c_count = get_count(sec_c)
            sec_c_sum = get_sum(sec_c)
            sec_start = hdf5_getters.get_sections_start(h5)
            sec_start_avg = get_avg(sec_start)
            sec_start_max = get_max(sec_start)
            sec_start_min = get_min(sec_start)
            sec_start_stddev = get_stddev(sec_start)
            sec_start_count = get_count(sec_start)
            sec_start_sum = get_sum(sec_start)
            seg_c = hdf5_getters.get_segments_confidence(h5)
            seg_c_avg = get_avg(seg_c)
            seg_c_max = get_max(seg_c)
            seg_c_min = get_min(seg_c)
            seg_c_stddev = get_stddev(seg_c)
            seg_c_count = get_count(seg_c)
            seg_c_sum = get_sum(seg_c)
            seg_loud_max = hdf5_getters.get_segments_loudness_max(h5)
            seg_loud_max_avg = get_avg(seg_loud_max)
            seg_loud_max_max = get_max(seg_loud_max)
            seg_loud_max_min = get_min(seg_loud_max)
            seg_loud_max_stddev = get_stddev(seg_loud_max)
            seg_loud_max_count = get_count(seg_loud_max)
            seg_loud_max_sum = get_sum(seg_loud_max)
            seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
            seg_loud_max_time_avg = get_avg(seg_loud_max_time)
            seg_loud_max_time_max = get_max(seg_loud_max_time)
            seg_loud_max_time_min = get_min(seg_loud_max_time)
            seg_loud_max_time_stddev = get_stddev(seg_loud_max_time)
            seg_loud_max_time_count = get_count(seg_loud_max_time)
            seg_loud_max_time_sum = get_sum(seg_loud_max_time)
            seg_loud_start = hdf5_getters.get_segments_loudness_start(h5)
            seg_loud_start_avg = get_avg(seg_loud_start)
            seg_loud_start_max = get_max(seg_loud_start)
            seg_loud_start_min = get_min(seg_loud_start)
            seg_loud_start_stddev = get_stddev(seg_loud_start)
            seg_loud_start_count = get_count(seg_loud_start)
            seg_loud_start_sum = get_sum(seg_loud_start)
            seg_pitch = hdf5_getters.get_segments_pitches(h5)
            pitch_size = len(seg_pitch)
            seg_start = hdf5_getters.get_segments_start(h5)
            seg_start_avg = get_avg(seg_start)
            seg_start_max = get_max(seg_start)
            seg_start_min = get_min(seg_start)
            seg_start_stddev = get_stddev(seg_start)
            seg_start_count = get_count(seg_start)
            seg_start_sum = get_sum(seg_start)
            seg_timbre = hdf5_getters.get_segments_timbre(h5)
            tatms_c = hdf5_getters.get_tatums_confidence(h5)
            tatms_c_avg = get_avg(tatms_c)
            tatms_c_max = get_max(tatms_c)
            tatms_c_min = get_min(tatms_c)
            tatms_c_stddev = get_stddev(tatms_c)
            tatms_c_count = get_count(tatms_c)
            tatms_c_sum = get_sum(tatms_c)
            tatms_start = hdf5_getters.get_tatums_start(h5)
            tatms_start_avg = get_avg(tatms_start)
            tatms_start_max = get_max(tatms_start)
            tatms_start_min = get_min(tatms_start)
            tatms_start_stddev = get_stddev(tatms_start)
            tatms_start_count = get_count(tatms_start)
            tatms_start_sum = get_sum(tatms_start)
            # Getting the genres
            genre_set = 0  # flag to see if the genre has been set or not
            art_trm = hdf5_getters.get_artist_terms(h5)
            trm_freq = hdf5_getters.get_artist_terms_freq(h5)
            trn_wght = hdf5_getters.get_artist_terms_weight(h5)
            a_mb_tags = hdf5_getters.get_artist_mbtags(h5)
            genre_indexes = get_genre_indexes(trm_freq)  # index of the highest freq
            final_genre = []
            genres_so_far = []
            for i in range(len(genre_indexes)):
                genre_tmp = get_genre(art_trm, genre_indexes[i])  # genre that corresponds to the highest freq
                genres_so_far = genre_dict.get_genre_in_dict(genre_tmp)  # getting the genre from the dictionary
                if len(genres_so_far) != 0:
                    for i in genres_so_far:
                        final_genre.append(i)
                    genre_set = 1  # genre was found in dictionary
            if genre_set == 1:
                col_num = []
                for genre in final_genre:
                    column = int(genre)  # getting the column number of the genre
                    col_num.append(column)
                genre_array = genre_columns(col_num)  # genre array
            else:
                genre_array = genre_columns(-1)  # the genre was not found in the dictionary
            transpose_pitch = seg_pitch.transpose()  # transpose the matrix, so we can have 12 rows
            # arrays containing the aggregate values of the 12 rows
            seg_pitch_avg = []
            seg_pitch_max = []
            seg_pitch_min = []
            seg_pitch_stddev = []
            seg_pitch_count = []
            seg_pitch_sum = []
            i = 0
            # Getting the aggregate values in the pitches array
            for row in transpose_pitch:
                seg_pitch_avg.append(get_avg(row))
                seg_pitch_max.append(get_max(row))
                seg_pitch_min.append(get_min(row))
                seg_pitch_stddev.append(get_stddev(row))
                seg_pitch_count.append(get_count(row))
                seg_pitch_sum.append(get_sum(row))
                i = i + 1
            # extracting information from the timbre array
            transpose_timbre = seg_timbre.transpose()  # transposing matrix, to have 12 rows
            # arrays containing the aggregate values of the 12 rows
            seg_timbre_avg = []
            seg_timbre_max = []
            seg_timbre_min = []
            seg_timbre_stddev = []
            seg_timbre_count = []
            seg_timbre_sum = []
            i = 0
            for row in transpose_timbre:
                seg_timbre_avg.append(get_avg(row))
                seg_timbre_max.append(get_max(row))
                seg_timbre_min.append(get_min(row))
                seg_timbre_stddev.append(get_stddev(row))
                seg_timbre_count.append(get_count(row))
                seg_timbre_sum.append(get_sum(row))
                i = i + 1
            # Writing to the flat file
            writer.writerow([title, album, artist_name, year, duration, seg_start_count, tempo])
            h5.close()
            count = count + 1
            print count
# This script converts the summary H5 file (only 300MB) to a csv file
# Run only on the Master Node since h5_getters cannot open a remote (ie. HDFS) file
if __name__ == "__main__":
    with open("fields.csv", "wb") as f:
        writer = csv.writer(f)  # initialize the csv writer
        # for each track in the summary file, get the 11 fields and output to csv
        h5_file = hdf5_getters.open_h5_file_read("msd_summary_file.h5")
        for k in range(1000000):
            print "index!!!: ", k
            id = hdf5_getters.get_track_id(h5_file, k)  # get track_id TRA13e39..
            title = hdf5_getters.get_title(h5_file, k)  # get song title
            artist_name = hdf5_getters.get_artist_name(h5_file, k)
            year = int(hdf5_getters.get_year(h5_file, k))
            hotness = float(hdf5_getters.get_song_hotttnesss(h5_file, k))
            artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5_file, k))
            f5 = int(hdf5_getters.get_key(h5_file, k))  # get key
            f2 = float(hdf5_getters.get_loudness(h5_file, k))  # get loudness
            f1 = float(hdf5_getters.get_tempo(h5_file, k))  # get tempo
            f4 = int(hdf5_getters.get_duration(h5_file, k))  # get duration
            f3 = float(hdf5_getters.get_time_signature(h5_file, k))  # get time signature
            # Get rid of missing info and change invalid numbers for meta data
            if not artist_name:
                artist_name = "unknown"
            if not artist_familiarity:
def get_song_info(h5):
    print '%s - %s | (%s) | %s bpm' % (hdf5_getters.get_artist_name(h5),
                                       hdf5_getters.get_title(h5),
                                       hdf5_getters.get_year(h5),
                                       hdf5_getters.get_tempo(h5))