def create_labels(songs, sp):
    """Look up a danceability label on Spotify for each song file.

    param: songs - list of absolute .h5 file paths (strings)
           sp    - the Spotify object handed through to get_danceability
    returns: (labels, broken_labels), both int32 arrays. `labels` holds the
             label of every song that was resolved; `broken_labels` holds
             the indices into `songs` for which get_danceability returned -1.
    """
    print("creating labels...")
    acc = 0
    labels = []
    broken_labels = []
    # NOTE(review): iteration starts at index 1, so songs[0] is never
    # labelled. Kept as-is; confirm whether that is intentional.
    for i in range(1, len(songs)):
        print(i)
        file_object = hdf.open_h5_file_read(songs[i])
        try:
            artist_name = hdf.get_artist_name(file_object).decode("utf-8")
            # Strip parenthesised suffixes before building the search query.
            title = re.sub(r"\(.*\)", "",
                           hdf.get_title(file_object).decode("utf-8"))
            query = "artist: " + artist_name + " track: " + title
            label = get_danceability(query, sp)
        finally:
            # Release the handle even when a getter or the lookup raises.
            file_object.close()
        if label != -1:
            labels.append(label)
        else:
            broken_labels.append(i)
            acc += 1
    print("NUMBER OF LOST SONGS = ", acc)
    return (np.array(labels, dtype=np.int32),
            np.array(broken_labels, dtype=np.int32))
def fetch_song_from_h5(h5_filepath):
    """Download the audio clip for one .h5 track file and save it as .mp3.

    Paths whose extension is not '.h5' are ignored.  The clip is requested
    through an OAuth-signed URL and written next to the .h5 file with the
    extension replaced by '.mp3'.  A 404 is reported and skipped; any other
    non-OK status prints the response and terminates the process.
    """
    basename, extension = os.path.splitext(h5_filepath)
    if extension != '.h5':
        return
    audio_filepath = basename + '.mp3'

    h5 = hdf5_getters.open_h5_file_read(h5_filepath)
    try:
        track_id = hdf5_getters.get_track_7digitalid(h5)
        track_name = hdf5_getters.get_title(h5)
        artist_name = hdf5_getters.get_artist_name(h5)
    finally:
        # Close the handle even if one of the getters raises.
        h5.close()

    consumer = oauth.Consumer(OAUTH_CLIENT_KEY, OAUTH_CLIENT_SECRET)
    token = oauth.Token(OAUTH_ACCESS_TOKEN, OAUTH_ACCESS_SECRET)
    request = oauth.Request.from_consumer_and_token(
        consumer,
        http_url=get_clip_url(track_id),
        is_form_encoded=True,
        parameters={'country': 'ww'})
    signing_method = oauth.SignatureMethod_HMAC_SHA1()
    request.sign_request(signing_method, consumer, token)
    url = request.to_url()

    r = requests.get(url)
    if r.status_code not in (requests.codes.ok, requests.codes.not_found):
        # Unexpected status: dump the response and stop the whole run.
        print(r.status_code, r.headers, r.content)
        exit()
    if r.status_code == requests.codes.ok:
        print('FETCHED track {0} {1} {2}'.format(
            track_id, artist_name, track_name))
        with open(audio_filepath, 'wb') as f:
            f.write(r.content)
    else:
        print('FAILED TO FETCH track {0} {1} {2}'.format(
            track_id, artist_name, track_name))
def read(self, path):
    """Dump selected fields of every HDF5 file in `path` to library_csv.csv.

    A header row is written first, then one row per file holding loudness,
    danceability, energy, tempo, time signature and title.  If a listed
    entry is not a regular file an error is printed and the process exits
    (exit status 0, unchanged from the previous behaviour).
    """
    import csv

    files = os.listdir(path)
    with open('library_csv.csv', 'w') as library_csv:
        writer = csv.writer(library_csv)
        writer.writerow([
            'Loudness', 'Danceability', 'Energy', 'Tempo', 'timeSignature',
            'Title'
        ])
        for filename in files:
            # Resolve against the directory that was actually listed; the
            # old hard-coded "Data/" prefix only worked when path == "Data".
            hdf5path = os.path.join(path, filename)
            # sanity check
            if not os.path.isfile(hdf5path):
                # Single-argument form prints the same on Python 2 and 3.
                print('ERROR: file %s does not exist.' % hdf5path)
                sys.exit(0)
            h5 = hdf5_getters.open_h5_file_read(hdf5path)
            try:
                row = [
                    hdf5.get_loudness(h5),
                    hdf5.get_danceability(h5),
                    hdf5.get_energy(h5),
                    hdf5.get_tempo(h5),
                    hdf5.get_time_signature(h5),
                    hdf5.get_title(h5),
                ]
            finally:
                h5.close()
            writer.writerow(row)
    # The `with` block already closed the CSV file; no explicit close needed.
def debug_from_song_file(connect,h5path,verbose=0):
    """
    Slow debugging function that takes a h5 file, reads the info,
    checks the match with the musicbrainz db, and prints out the result.
    Only prints when we don't get an exact match (and verbose > 0)!
    RETURN
       (gotmbid, gotyear, gottags): 0/1 indicators telling whether an
       artist mbid, a year and at least one tag were found for the file
    """
    import hdf5_utils as HDF5
    import hdf5_getters as GETTERS
    h5 = HDF5.open_h5_file_read(h5path)
    try:
        title = GETTERS.get_title(h5)
        release = GETTERS.get_release(h5)
        artist = GETTERS.get_artist_name(h5)
        ambid = GETTERS.get_artist_mbid(h5)
    finally:
        # close the file even if a getter raises
        h5.close()
    # mbid (truthiness covers both '' and b'')
    gotmbid = 1
    if not ambid:
        gotmbid = 0
        if verbose > 0:
            print('no mb id for:', artist)
    # year
    year = find_year_safemode(connect, ambid, title, release, artist)
    gotyear = 1 if year > 0 else 0
    # only report a miss, consistent with the mbid and tags messages
    if gotyear == 0 and verbose > 0:
        print('no years for:', artist, '|', release, '|', title)
    # tags
    tags, counts = get_artist_tags(connect, ambid)
    gottags = 1 if len(tags) > 0 else 0
    if gottags == 0 and verbose > 0:
        print('no tags for:', artist)
    # return indicator for mbid, year, tag
    return gotmbid, gotyear, gottags
def process_song(self, song_path):
    """Build a feature dict for one song file, enriched with Spotify data.

    Reads id, title, artist, year, timbre and chroma from the HDF5 file at
    `song_path` and merges in whatever SpotifyInterface.search_track_info
    returns for the (artist, title) pair.

    Returns the merged dict, or None when Spotify has no match.
    """
    song_data = h5.open_h5_file_read(song_path)
    try:
        song_int_id = int(h5.get_track_7digitalid(song_data))
        song_name = h5.get_title(song_data).decode('UTF-8').lower()
        artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
        song_year = int(h5.get_year(song_data))

        sp = SpotifyInterface()
        track_info = sp.search_track_info(artist_name, song_name)
        if track_info is None:
            # No match: skip the (large) segment arrays entirely.
            return None

        timbre = self.ndarray_list_to_ndlist(
            h5.get_segments_timbre(song_data))
        chroma = self.ndarray_list_to_ndlist(
            h5.get_segments_pitches(song_data))
    finally:
        # Runs on every exit path, including errors from the Spotify call.
        song_data.close()

    song_dict = {'id': song_int_id,
                 'name': song_name,
                 'artist': artist_name,
                 'year': song_year,
                 'timbre': timbre,
                 'chroma': chroma,
                 **track_info}
    return song_dict
def get_all_examples(basedir, genre_dict, ext='.h5'):
    """
    From a base directory, goes through all subdirectories,
    and grabs all songs and their features and puts them into a pandas dataframe
    INPUT
       basedir    - base directory of the dataset
       genre_dict - a dictionary mapping track id to genre
       ext        - extension, .h5 by default
    RETURN
       dataframe containing one row per song whose track id is in genre_dict
       (an empty DataFrame when there is none)
    """
    columns = [
        'artist_name', 'title', 'song_id', 'genre', 'year', 'key',
        'key_confidence', 'mode', 'mode_confidence', 'time_signature',
        'time_signature_confidence', 'duration', 'end_of_fade_in',
        'loudness', 'song_hotttnesss', 'tempo', 'num_segments'
    ]
    examples = []
    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = GETTERS.open_h5_file_read(f)
            try:
                song_id = GETTERS.get_track_id(h5).decode('utf-8')
                if song_id not in genre_dict:
                    continue
                genre = genre_dict[song_id]
                year = GETTERS.get_year(h5)
                duration = GETTERS.get_duration(h5)
                end_of_fade_in = GETTERS.get_end_of_fade_in(h5)
                loudness = GETTERS.get_loudness(h5)
                song_hotttnesss = GETTERS.get_song_hotttnesss(h5)
                tempo = GETTERS.get_tempo(h5)
                key = GETTERS.get_key(h5)
                key_confidence = GETTERS.get_key_confidence(h5)
                mode = GETTERS.get_mode(h5)
                mode_confidence = GETTERS.get_mode_confidence(h5)
                time_signature = GETTERS.get_time_signature(h5)
                time_signature_confidence = \
                    GETTERS.get_time_signature_confidence(h5)
                artist_name = GETTERS.get_artist_name(h5)
                title = GETTERS.get_title(h5)
                # The column is labelled 'num_segments', so fill it with the
                # segment count (it used to receive the section count).
                num_segments = len(GETTERS.get_segments_confidence(h5))
            finally:
                # `continue` above still passes through here.
                h5.close()
            examples.append(pd.DataFrame(
                data=[(artist_name, title, song_id, genre, year, key,
                       key_confidence, mode, mode_confidence, time_signature,
                       time_signature_confidence, duration, end_of_fade_in,
                       loudness, song_hotttnesss, tempo, num_segments)],
                columns=columns))
    if not examples:
        return pd.DataFrame()
    # One concat instead of repeated DataFrame.append (quadratic, and removed
    # in pandas 2.0); each row keeps index 0 exactly as append produced.
    return pd.concat(examples)
def traverseAndWrite(root, genreDirs, genreKeys):
    """Recursively walk `root` and write one JSON record per song and genre.

    Directories are descended into.  For each song in an HDF5 file that
    passes notValidSong(), a record is appended to
    genreDirs[genre]/<artist>--<title>.json for every genre in `genreKeys`
    that genreInTags() finds in the artist's tags.
    """
    if not isfile(root):
        for f in listdir(root):
            traverseAndWrite(root + "/" + f, genreDirs, genreKeys)
        return

    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    h5 = hdf5_getters.open_h5_file_read(root)
    try:
        numOfSongs = hdf5_getters.get_num_songs(h5)
        for index in range(numOfSongs):
            tags = hdf5_getters.get_artist_mbtags(h5, index)
            artist = hdf5_getters.get_artist_name(h5, index)
            songName = hdf5_getters.get_title(h5, index)
            segmentTimbre = hdf5_getters.get_segments_timbre(h5, index)
            segmentPitches = hdf5_getters.get_segments_pitches(h5, index)
            if notValidSong(tags, artist, songName, segmentTimbre,
                            segmentPitches):
                # Do NOT close here: later songs of the same file still
                # need the handle (it is closed once in `finally`).
                continue
            # File-name-safe copies; the record keeps the original strings
            # no matter how many genres match.
            safeTitle = ''.join(c for c in songName if c in valid_chars)
            safeArtist = ''.join(c for c in artist if c in valid_chars)
            for genre in genreKeys:
                if genreInTags(genre, tags):
                    song = {}
                    song['genre'] = genre
                    song['artist_name'] = artist
                    song['song_title'] = songName
                    song['segments_pitches'] = segmentPitches.tolist()
                    song['segments_timbre'] = segmentTimbre.tolist()
                    outPath = (genreDirs[genre] + "/" + safeArtist + "--"
                               + safeTitle + ".json")
                    with open(outPath, 'a') as fd:
                        writeToDescriptor(fd, song)
    finally:
        h5.close()
def parse_songs(directory):
    """Recursively convert song files under `directory` to JSON summaries.

    Stops once the module-level `count` reaches MAX_SONGS.  For every song
    in every file, a dict with title, year, danceability, genres and tempo
    is written to <output dir>/<file base name>.json.
    """
    global count
    global MAX_SONGS
    for filename in os.listdir(directory):
        if count >= MAX_SONGS:
            return
        file_path = os.path.join(directory, filename)
        if os.path.isdir(file_path):
            parse_songs(file_path)
            continue
        count += 1
        if count % 100 == 0:
            print('Parsed ' + str(count) + ' songs')
        # Kept separate from `song`: the old code rebound `song` to the
        # splitext() tuple and dumped that instead of the song data.
        base_name = os.path.splitext(filename)[0]
        with hdf5_getters.open_h5_file_read(file_path) as h5:
            for i in range(hdf5_getters.get_num_songs(h5)):
                title = hdf5_getters.get_title(h5, i).decode('UTF-8')
                year = hdf5_getters.get_year(h5, i).item()
                danceability = hdf5_getters.get_danceability(h5, i).item()
                tags = hdf5_getters.get_artist_mbtags(h5, i).tolist()
                genres = [tag.decode('UTF-8') for tag in tags]
                tempo = hdf5_getters.get_tempo(h5, i).item()
                song = {
                    'title': title,
                    'year': year,
                    'danceability': danceability,
                    'genres': genres,
                    'tempo': tempo
                }
                # NOTE(review): a file holding several songs rewrites the
                # same JSON path each iteration (only the last one survives).
                with open(
                        "/home/ubuntu/million_songs/parsed_data/" +
                        base_name + '.json', 'w') as fp:
                    json.dump(song, fp)
def feat_from_file(path):
    """
    Extract a list of features from one song file, already converted to
    strings: track id, artist name and title (commas removed), loudness,
    tempo, time signature, key, mode, duration, then the per-dimension
    average and variance of the segment timbre.
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    try:
        # basic info
        # NOTE(review): track id is not decoded like name/title, so str()
        # of a bytes value yields "b'...'" -- left unchanged for callers.
        feats.append(GETTERS.get_track_id(h5))
        feats.append(GETTERS.get_artist_name(h5).decode().replace(',', ''))
        feats.append(GETTERS.get_title(h5).decode().replace(',', ''))
        feats.append(GETTERS.get_loudness(h5))
        feats.append(GETTERS.get_tempo(h5))
        feats.append(GETTERS.get_time_signature(h5))
        feats.append(GETTERS.get_key(h5))
        feats.append(GETTERS.get_mode(h5))
        feats.append(GETTERS.get_duration(h5))
        # timbre
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre, axis=0))
        feats.extend(np.var(timbre, axis=0))
    finally:
        # done with h5 file, also when a getter raised
        h5.close()
    # Return a real list: a bare map() is a one-shot iterator on Python 3.
    return [str(x) for x in feats]
def get_key_tempo(filename):
    """Append tempo, key, artist, title and top artist term to points.csv.

    Tempo, key, artist name and title come from the song file; the artist's
    terms are requested from the Echo Nest API.  A row is written only when
    at least one term came back; API errors are reported and the song is
    skipped.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        tempo = GETTERS.get_tempo(h5)
        key = GETTERS.get_key(h5)
        ar = GETTERS.get_artist_name(h5)
        title = GETTERS.get_title(h5)
    finally:
        # Nothing below reads the file, so release it before the API call.
        h5.close()

    terms = None
    try:
        a = artist.Artist(str(ar))
        terms = a.get_terms()
        time.sleep(.12)
    except EchoNestIOError:
        print("echonestIOerror")
    except EchoNestAPIError as e:
        if e.code == 3:
            time.sleep(1)
        elif e.code == 5:
            print("code is 5")
        else:
            print("error..")

    if terms:
        print(terms[0]['name'])
        with open('points.csv', 'a') as fp:
            writer = csv.writer(fp, delimiter=',')
            writer.writerow([tempo, key, ar, title, terms[0]['name']])
def debug_from_song_file(connect, h5path, verbose=0):
    """
    Slow debugging function that takes a h5 file, reads the info,
    checks the match with the musicbrainz db, and prints out the result.
    Only prints when we don't get an exact match (and verbose > 0)!
    RETURN
       (gotmbid, gotyear, gottags): 0/1 indicators telling whether an
       artist mbid, a year and at least one tag were found for the file
    """
    import hdf5_utils as HDF5
    import hdf5_getters as GETTERS
    h5 = HDF5.open_h5_file_read(h5path)
    try:
        title = GETTERS.get_title(h5)
        release = GETTERS.get_release(h5)
        artist = GETTERS.get_artist_name(h5)
        ambid = GETTERS.get_artist_mbid(h5)
    finally:
        # close the file even if a getter raises
        h5.close()
    # mbid (truthiness covers both '' and b'')
    gotmbid = 1
    if not ambid:
        gotmbid = 0
        if verbose > 0:
            print('no mb id for:', artist)
    # year
    year = find_year_safemode(connect, ambid, title, release, artist)
    gotyear = 1 if year > 0 else 0
    # only report a miss, consistent with the mbid and tags messages
    if gotyear == 0 and verbose > 0:
        print('no years for:', artist, '|', release, '|', title)
    # tags
    tags, counts = get_artist_tags(connect, ambid)
    gottags = 1 if len(tags) > 0 else 0
    if gottags == 0 and verbose > 0:
        print('no tags for:', artist)
    # return indicator for mbid, year, tag
    return gotmbid, gotyear, gottags
def feat_from_file(path):
    """
    Extract a flat feature list from one song file: track id, title,
    artist name, year, loudness, tempo, time signature, key, mode,
    duration, then the per-dimension average and variance of the
    segment timbre.  Values are returned as read (not stringified).
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    try:
        feats.append(GETTERS.get_track_id(h5))
        feats.append(GETTERS.get_title(h5))
        feats.append(GETTERS.get_artist_name(h5))
        feats.append(GETTERS.get_year(h5))
        feats.append(GETTERS.get_loudness(h5))
        feats.append(GETTERS.get_tempo(h5))
        feats.append(GETTERS.get_time_signature(h5))
        feats.append(GETTERS.get_key(h5))
        feats.append(GETTERS.get_mode(h5))
        feats.append(GETTERS.get_duration(h5))
        # timbre
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre, axis=0))
        feats.extend(np.var(timbre, axis=0))
    finally:
        # release the handle even if a getter raised
        h5.close()
    return feats
def get_info(basedir,ext='.h5') :
    """Write one JSON object per song under `basedir` to result.txt.

    Each line of result.txt holds title, artist name, key, mode, hotness,
    artist location, longitude and latitude of one file matching `ext`;
    the same dictionary is also printed.
    """
    # Create new text file for storing the result of JSON objects
    with open("result.txt", "w") as resultFile:
        # Going through all sub-directories under the base directory
        for root, dirs, files in os.walk(basedir):
            files = glob.glob(os.path.join(root, '*' + ext))
            for f in files:
                # Open the HDF5 for reading the content
                h5 = hdf5_getters.open_h5_file_read(f)
                try:
                    # Creating dictionary to convert to JSON object
                    dictionary = {}
                    dictionary["song_title"] = \
                        hdf5_getters.get_title(h5).decode('Latin-1')
                    dictionary["artist_name"] = \
                        hdf5_getters.get_artist_name(h5).decode('Latin-1')
                    dictionary["key"] = float(hdf5_getters.get_key(h5))
                    dictionary["minor-major"] = \
                        float(hdf5_getters.get_mode(h5))
                    dictionary["hotness"] = \
                        hdf5_getters.get_song_hotttnesss(h5)
                    dictionary["artist_location"] = \
                        hdf5_getters.get_artist_location(h5).decode('Latin-1')
                    dictionary["longitude"] = \
                        float(hdf5_getters.get_artist_longitude(h5))
                    dictionary["latitude"] = \
                        float(hdf5_getters.get_artist_latitude(h5))
                finally:
                    # Close the song file even if a getter or decode fails
                    h5.close()
                print(dictionary)
                # Write the created JSON object to the text file
                resultFile.write(str(json.dumps(dictionary)) + "\n")
def load_non_time_data():
    """Collect release years and ten scalar features per song.

    Walks the module-level `basedir` for files matching the module-level
    `ext`.  For every song whose year is non-zero, the year is appended to
    `years` and a 10-element vector (title length, tag count, artist
    hotness, duration, term count, loudness, mode, release-name length,
    tempo, artist-name length) to `ten_features`.

    Returns (years, ten_features).
    """
    years = []
    ten_features = []
    num = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                # NOTE(review): the feature extraction is taken to be nested
                # under this test so both lists stay aligned -- confirm
                # against the original indentation.
                if year != 0:
                    years.append(year)
                    title_length = len(getter.get_title(h5))
                    terms_length = len(getter.get_artist_terms(h5))
                    tags_length = len(getter.get_artist_mbtags(h5))
                    hotness = getter.get_artist_hotttnesss(h5)
                    duration = getter.get_duration(h5)
                    loudness = getter.get_loudness(h5)
                    mode = getter.get_mode(h5)
                    release_length = len(getter.get_release(h5))
                    tempo = getter.get_tempo(h5)
                    name_length = len(getter.get_artist_name(h5))
                    ten_feature = np.hstack([
                        title_length, tags_length, hotness, duration,
                        terms_length, loudness, mode, release_length, tempo,
                        name_length
                    ])
                    ten_features.append(ten_feature)
            except Exception:
                # Best effort: flag the failure and move on.  Narrowed from
                # a bare except so Ctrl-C / SystemExit still propagate.
                print(1)
            finally:
                h5.close()
    return years, ten_features
def get_all_titles(basedir, ext='.h5'):
    """Collect several fields from every song file under `basedir`.

    Returns five lists: titles, artist names, artist terms, loudness and
    per-segment max loudness.  Terms and segment loudness are best-effort:
    a file whose getter fails contributes no entry to that list, so those
    two lists may be shorter than the others.
    """
    titles = []
    artist_names = []
    terms = []
    loudness = []
    segments_loudness_max = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append(hdf5_getters.get_title(h5))
                artist_names.append(hdf5_getters.get_artist_name(h5))
                try:
                    terms.append(hdf5_getters.get_artist_terms(h5))
                except Exception:
                    # deliberately skipped; narrowed from a bare except so
                    # KeyboardInterrupt is no longer swallowed
                    pass
                loudness.append(hdf5_getters.get_loudness(h5))
                try:
                    segments_loudness_max.append(
                        hdf5_getters.get_segments_loudness_max(h5))
                except Exception:
                    pass
            finally:
                h5.close()
    return titles, artist_names, terms, loudness, segments_loudness_max
def process_song(self, song_path):
    """Read one song file and return its features as a dict.

    The dict holds the 7digital id ('id'), the song id ('source_id'),
    lower-cased title and artist, year, and the timbre / chroma segment
    matrices converted with self.ndarray_list_to_ndlist.
    """
    song_data = h5.open_h5_file_read(song_path)
    try:
        song_id = h5.get_song_id(song_data).decode('UTF-8')
        song_int_id = int(h5.get_track_7digitalid(song_data))
        song_name = h5.get_title(song_data).decode('UTF-8').lower()
        artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
        song_year = int(h5.get_year(song_data))
        timbre = self.ndarray_list_to_ndlist(
            h5.get_segments_timbre(song_data))
        chroma = self.ndarray_list_to_ndlist(
            h5.get_segments_pitches(song_data))
    finally:
        # Release the handle even when a getter or conversion raises.
        song_data.close()
    return {
        'id': song_int_id,
        'source_id': song_id,
        'name': song_name,
        'artist': artist_name,
        'year': song_year,
        'timbre': timbre,
        'chroma': chroma
    }
def get_all_titles(basedir,ext='.h5') :
    """Collect several fields from every song file under `basedir`.

    Returns five lists: titles, artist names, artist terms, loudness and
    per-segment max loudness.  Terms and segment loudness are best-effort:
    a file whose getter fails contributes no entry to that list, so those
    two lists may be shorter than the others.
    """
    titles = []
    artist_names = []
    terms = []
    loudness = []
    segments_loudness_max = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append(hdf5_getters.get_title(h5))
                artist_names.append(hdf5_getters.get_artist_name(h5))
                try:
                    terms.append(hdf5_getters.get_artist_terms(h5))
                except Exception:
                    # deliberately skipped; narrowed from a bare except so
                    # KeyboardInterrupt is no longer swallowed
                    pass
                loudness.append(hdf5_getters.get_loudness(h5))
                try:
                    segments_loudness_max.append(
                        hdf5_getters.get_segments_loudness_max(h5))
                except Exception:
                    pass
            finally:
                h5.close()
    return titles, artist_names, terms, loudness, segments_loudness_max
def feat_from_file(path):
    """
    Extract a list of features from one song file, already converted to
    strings: track id, artist name and title (commas removed), loudness,
    tempo, time signature, key, mode, duration, then the per-dimension
    average and variance of the segment timbre.
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    try:
        # basic info
        feats.append( GETTERS.get_track_id(h5) )
        feats.append( GETTERS.get_artist_name(h5).replace(',','') )
        feats.append( GETTERS.get_title(h5).replace(',','') )
        feats.append( GETTERS.get_loudness(h5) )
        feats.append( GETTERS.get_tempo(h5) )
        feats.append( GETTERS.get_time_signature(h5) )
        feats.append( GETTERS.get_key(h5) )
        feats.append( GETTERS.get_mode(h5) )
        feats.append( GETTERS.get_duration(h5) )
        # timbre
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre,axis=0))
        feats.extend(np.var(timbre,axis=0))
    finally:
        # done with h5 file, also when a getter raised
        h5.close()
    # makes sure we return strings -- as a real list on Python 2 and 3
    # (a bare map() is a one-shot iterator on Python 3)
    return [str(x) for x in feats]
def func_to_desired_song_data(filename):
    """Append this song's metadata to all_the_data if it was selected.

    The file's track id is compared with the first element of every entry
    of the module-level `random_songs`; for each match a dict of title,
    artist, year, tempo, key, loudness, energy, danceability, time
    signature, mode and hotttness is appended to `all_the_data`.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        track_id = GETTERS.get_track_id(h5)
        for song in random_songs:
            if song[0] != track_id:
                continue
            print("FOUND ONE!")
            song_data = {
                'title': replace_characters(GETTERS.get_title(h5)),
                'artist': replace_characters(GETTERS.get_artist_name(h5)),
                'year': GETTERS.get_year(h5),
                'tempo': GETTERS.get_tempo(h5),
                'key': GETTERS.get_key(h5),
                'loudness': GETTERS.get_loudness(h5),
                'energy': GETTERS.get_energy(h5),
                'danceability': GETTERS.get_danceability(h5),
                'time_signature': GETTERS.get_time_signature(h5),
                'mode': GETTERS.get_mode(h5),
                'hotttness': GETTERS.get_song_hotttnesss(h5)
            }
            all_the_data.append(song_data)
    finally:
        # Release the handle even if a getter raises.
        h5.close()
def get_attribute(files):
    """Return one attribute array per song file in `files`.

    Each array holds, in order: number of songs, artist familiarity, artist
    hotttnesss, danceability, energy, key, key confidence, loudness, mode,
    mode confidence, tempo, time signature, time signature confidence,
    title and artist name, passed through np.nan_to_num.
    """
    array = []
    for f in files:
        print(f)
        h5 = hdf5_getters.open_h5_file_read(f)
        try:
            temp = [
                hdf5_getters.get_num_songs(h5),
                hdf5_getters.get_artist_familiarity(h5),
                hdf5_getters.get_artist_hotttnesss(h5),
                hdf5_getters.get_danceability(h5),
                hdf5_getters.get_energy(h5),
                hdf5_getters.get_key(h5),
                hdf5_getters.get_key_confidence(h5),
                hdf5_getters.get_loudness(h5),
                hdf5_getters.get_mode(h5),
                hdf5_getters.get_mode_confidence(h5),
                hdf5_getters.get_tempo(h5),
                hdf5_getters.get_time_signature(h5),
                hdf5_getters.get_time_signature_confidence(h5),
                hdf5_getters.get_title(h5),
                hdf5_getters.get_artist_name(h5),
            ]
        finally:
            # Release the handle even if a getter raises.
            h5.close()
        # NOTE(review): title/artist make this a mixed-type list, so the
        # NaN replacement may not act on the numeric fields -- verify.
        array.append(np.nan_to_num(temp))
    return array
def get_all_titles(basedir,ext='.h5') :
    """Return the title of every file matching `ext` under `basedir`."""
    titles = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append( hdf5_getters.get_title(h5) )
            finally:
                # close the file even if the getter raises
                h5.close()
    return titles
def get_all_titles(basedir, ext='.h5'):
    """Return the title of every file matching `ext` under `basedir`."""
    titles = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append(hdf5_getters.get_title(h5))
            finally:
                # close the file even if the getter raises
                h5.close()
    return titles
def hdf5_to_features(file_name):
    """
    Turn one HDF5 song file into an identification triple and a feature
    vector.

    Parameters
    ----------
    file_name : str
        Path to the HDF5 file.

    Returns
    -------
    list1 : list
        Song ID, title and artist name, in that order.
    list2 : list
        34 values: 10%/90% quantile, variance and mean of the gaps between
        beat starts; the same four statistics of the per-segment max
        loudness; the 12 column means of the segment pitches plus the
        variance of those means; the 12 column means of the segment timbre
        plus the variance of those means.
    """

    def _spread(values):
        # [10% quantile, 90% quantile, variance, mean] of `values`
        low, high = np.quantile(values, [0.1, 0.9])
        return [low, high, np.var(values), np.mean(values)]

    with hdf5_getters.open_h5_file_read(file_name) as reader:
        # Identification
        identification = [
            hdf5_getters.get_song_id(reader),
            hdf5_getters.get_title(reader),
            hdf5_getters.get_artist_name(reader),
        ]

        # Features 1-4: spacing between consecutive beats
        beat_gaps = np.diff(hdf5_getters.get_beats_start(reader), axis=0)
        features = _spread(beat_gaps)

        # Features 5-8: per-segment peak loudness
        features = features + _spread(
            hdf5_getters.get_segments_loudness_max(reader))

        # Features 9-21: mean of each pitch column, then their variance
        pitch_columns = hdf5_getters.get_segments_pitches(reader).mean(axis=0)
        features = features + list(pitch_columns) + [pitch_columns.var()]

        # Features 22-34: mean of each timbre column, then their variance
        timbre_columns = hdf5_getters.get_segments_timbre(reader).mean(axis=0)
        features = features + list(timbre_columns) + [timbre_columns.var()]

    return identification, features
def print_all_titles(basedir):
    """Print the .h5 file list of every directory under `basedir`, followed
    by the title stored in each of those files."""
    ext = '.h5'
    # Get all files with extension .h5
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        print(files)
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                print(hdf5_getters.get_title(h5))
            finally:
                # close the file even if the getter raises
                h5.close()
def main():
    """Export title and artist of every song from 1990 onwards to a CSV.

    Walks the hard-coded subset directory, skips songs whose year is below
    1990, and writes one quoted, lower-cased `title,artist` row per
    remaining song.  The header line is emitted together with the first
    row that is written.
    """
    csvRowString = "Title,ArtistName"
    csvAttributeList = [
        attribute.lower() for attribute in re.split(',', csvRowString)
    ]
    # Header is buffered here and flushed with the first data row.
    csvRowString += ",\n"

    basedir = '/Users/Owner/Desktop/School/2019-2020/COMP400/MillionSongSubset/'
    ext = ".h5"

    with open('../Datasets/MSDSubsetCSV.csv', 'w') as outputFile1:
        for root, dirs, files in os.walk(basedir):
            files = glob.glob(os.path.join(root, '*' + ext))
            for f in files:
                print(f)
                songH5File = hdf5_getters.open_h5_file_read(f)
                try:
                    song = Song(str(hdf5_getters.get_song_id(songH5File)))
                    # str() of a bytes value yields "b'...'"; the prefix is
                    # stripped here and the quotes further below.
                    song.title = str(
                        hdf5_getters.get_title(songH5File)).replace(
                            "b'", "").lower()
                    song.artistName = str(
                        hdf5_getters.get_artist_name(songH5File)).replace(
                            "b'", "").lower()
                    song.year = str(hdf5_getters.get_year(songH5File))
                finally:
                    # Closed on every path; the old `continue` for pre-1990
                    # songs skipped the close and leaked the handle.
                    songH5File.close()

                if int(song.year) < 1990:
                    print('nope', int(song.year))
                    continue

                for attribute in csvAttributeList:
                    if attribute == 'ArtistName'.lower():
                        csvRowString += "\"" + song.artistName.replace(
                            "'", "") + "\""
                    elif attribute == 'Title'.lower():
                        csvRowString += "\"" + song.title.replace(
                            "'", "") + "\""
                    else:
                        csvRowString += "Erm. This didn't work. Error. :( :(\n"
                    csvRowString += ","

                # Remove the final comma from each row in the csv
                csvRowString = csvRowString[:-1]
                csvRowString += "\n"
                outputFile1.write(csvRowString)
                csvRowString = ""
def func_to_get_instrumental(filename):
    """Record the song in `classical` if its artist is tagged as such.

    When the artist's musicbrainz tags contain 'classical' or 'orchestral',
    the module-level `classical` dict receives an entry keyed by song id
    holding the artist name and title.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        tags = set(GETTERS.get_artist_mbtags(h5))
        genres = {'classical', 'orchestral'}
        if tags.intersection(genres):
            d = {}
            d['artist'] = GETTERS.get_artist_name(h5)
            d['title'] = GETTERS.get_title(h5)
            song_id = GETTERS.get_song_id(h5)
            classical[song_id] = d
    finally:
        # Release the handle even if a getter raises.
        h5.close()
def get_track_info(track,h5=None):
    """Return '<artist>-<title>' for a track.

    When `h5` is not supplied, the file is located from the track id
    (characters 2, 3 and 4 select the sub-directories), opened here and
    closed again before returning.  A handle passed in by the caller is
    left open.
    """
    close = h5 is None
    if close:
        path = "../../msd_dense_subset/mood/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5"
        h5 = GETTERS.open_h5_file_read(path)
    try:
        artist = GETTERS.get_artist_name(h5)
        title = GETTERS.get_title(h5)
    finally:
        # Only close what this function opened, even on error.
        if close:
            h5.close()
    return str(artist) + '-' + str(title)
def better_MSD_sample_dirslist(paths):
    """ get list of [filename, artist, song title] for every entry of every
    directory in `paths` (a list of MSD sample directories) """
    dirdata = []
    for path in paths:
        for fname in os.listdir(path):
            # os.path.join also works when `path` lacks a trailing
            # separator; plain concatenation silently built a wrong path.
            with GETTERS.open_h5_file_read(os.path.join(path, fname)) as h5:
                dirdata.append([fname,
                                GETTERS.get_artist_name(h5),
                                GETTERS.get_title(h5)])
    return dirdata
def get_all_titles(basedir, ext='.h5'):
    """Print each song's cleaned title with its billboard lookup result.

    For every file matching `ext` under `basedir`, the title is stripped of
    everything except letters, digits and spaces, and passed with the
    artist name to ws.billboard(); the first element of its result is
    printed next to the title.
    """
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                title = hdf5_getters.get_title(h5)
                title = re.sub('[^A-Za-z0-9 ]+', '', title)
                name = hdf5_getters.get_artist_name(h5)
            finally:
                # close the file even if a getter or the regex raises
                h5.close()
            x, _, _ = ws.billboard(title, name)
            # Single-argument form: same text on Python 2 and Python 3.
            print('%s %s' % (title, x))
def get_track_info(track, h5=None):
    """Return '<artist>-<title>' for a track.

    When `h5` is not supplied, the file is located from the track id
    (characters 2, 3 and 4 select the sub-directories), opened here and
    closed again before returning.  A handle passed in by the caller is
    left open.
    """
    close = h5 is None
    if close:
        path = ("../../../msd_dense_subset/dense/" + track[2] + "/" +
                track[3] + "/" + track[4] + "/" + track + ".h5")
        h5 = GETTERS.open_h5_file_read(path)
    try:
        artist = GETTERS.get_artist_name(h5)
        title = GETTERS.get_title(h5)
    finally:
        # Only close what this function opened, even on error.
        if close:
            h5.close()
    return str(artist) + '-' + str(title)
def MSD_sample_dirlist(path):
    """ get list of [filename, artist, song title] for every entry of an
    MSD sample directory """
    dirdata = []
    for fname in os.listdir(path):
        # os.path.join also works when `path` lacks a trailing separator.
        h5 = GETTERS.open_h5_file_read(os.path.join(path, fname))
        try:
            dirdata.append([fname,
                            GETTERS.get_artist_name(h5),
                            GETTERS.get_title(h5)])
        finally:
            # close the file even if a getter raises
            h5.close()
    return dirdata
def extract_data(filename):
    """Append title, artist name and duration of a song to `songdata`.

    The entry is keyed by the file's song id.  Songs whose id is in the
    module-level `already` collection are skipped.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        track_id = GETTERS.get_song_id(h5)
        if track_id in already:
            return
        songdata[track_id].append(GETTERS.get_title(h5))
        songdata[track_id].append(GETTERS.get_artist_name(h5))
        # Call the getter: the old code appended the function object itself.
        songdata[track_id].append(GETTERS.get_duration(h5))
    finally:
        # Covers the early return as well as getter errors.
        h5.close()
def getArtistNameAndSongName(filename):
    """
    This function does 3 simple things:
    - open the song file
    - store (artist name, song title) in songsAndArtists under the
      stringified song id
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        song_name = GETTERS.get_title(h5)
        song_id = GETTERS.get_song_id(h5)
        songsAndArtists[str(song_id)] = (artist_name, song_name)
    finally:
        # close the file even if a getter raises
        h5.close()
def getInfo(files):
    """Return an array of [tid, artist, song] rows, one per file.

    The first row is the header ['tid', 'artist', 'song'].  `tid` is the
    file's base name up to its first dot.  With no files, only the
    one-dimensional header array is returned.
    """
    header = np.array(['tid', 'artist', 'song'])
    rows = []
    for fil in files:
        curFile = getter.open_h5_file_read(fil)
        try:
            tid = fil.split('/')[-1].split('.')[0]
            curArtist = getter.get_artist_name(curFile)
            curTitle = getter.get_title(curFile)
            rows.append(np.array([tid, curArtist, curTitle]))
        finally:
            # close the file even if a getter raises
            curFile.close()
    if not rows:
        return header
    # Stack once at the end: stacking inside the loop re-copied every
    # previous row on each iteration (quadratic in the number of files).
    return np.vstack([header] + rows)
def _extractSongData(file_path, filename):
    """Read the identifying metadata of one song file.

    Returns (track_id, song_id, dig7_id, title, release, artist_name,
    year), where track_id is `filename` without its last three characters
    (the '.h5' suffix) and the text fields are decoded as UTF-8.
    """
    track_id = filename[:-3]
    h5 = hdf5_getters.open_h5_file_read(file_path)
    try:
        song_id = hdf5_getters.get_song_id(h5).decode('UTF-8')
        dig7_id = hdf5_getters.get_track_7digitalid(h5)
        title = hdf5_getters.get_title(h5).decode('UTF-8')
        release = hdf5_getters.get_release(h5).decode('UTF-8')
        artist_name = hdf5_getters.get_artist_name(h5).decode('UTF-8')
        year = hdf5_getters.get_year(h5)
    finally:
        # Release the handle even if a getter or decode fails.
        h5.close()
    return track_id, song_id, dig7_id, title, release, artist_name, year
def songinfo(if_str):
    """Return '<artist> - <title> (<year>)' for the song keyed by `if_str`.

    The key is mapped to a track id through the pickled songs_tracks
    table; characters 2, 3 and 4 of the track id select the
    sub-directories of the song file.
    """
    # NOTE: pickle.load can execute arbitrary code; only use this with a
    # songs_tracks.pkl that comes from a trusted source.
    # Binary mode is what pickle needs, and `with` closes the file.
    with open("../../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as pkl:
        songs_tracks = pickle.load(pkl)
    track = str(songs_tracks[if_str])
    # build path
    path = "../../msd_dense_subset/dense/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5"
    h5 = GETTERS.open_h5_file_read(path)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        song_name = GETTERS.get_title(h5)
        year = GETTERS.get_year(h5, 0)
    finally:
        # close the song file even if a getter raises
        h5.close()
    return artist_name+ " - " +song_name + " (" +str(year) +")"
def getArtistNameAndSongName(filename):
    """
    This function does 3 simple things:
    - open the song file
    - store (artist name, song title) in songsAndArtists under the
      stringified song id
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        song_name = GETTERS.get_title(h5)
        song_id = GETTERS.get_song_id(h5)
        songsAndArtists[str(song_id)] = (artist_name, song_name)
    finally:
        # close the file even if a getter raises
        h5.close()
def getInfo(files):
    """Return an array of [tid, artist, song] rows, one per file.

    The first row is the header ['tid', 'artist', 'song'].  `tid` is the
    file's base name up to its first dot.  With no files, only the
    one-dimensional header array is returned.
    """
    header = np.array(['tid', 'artist', 'song'])
    rows = []
    for fil in files:
        curFile = getter.open_h5_file_read(fil)
        try:
            tid = fil.split('/')[-1].split('.')[0]
            curArtist = getter.get_artist_name(curFile)
            curTitle = getter.get_title(curFile)
            rows.append(np.array([tid, curArtist, curTitle]))
        finally:
            # close the file even if a getter raises
            curFile.close()
    if not rows:
        return header
    # Stack once at the end: stacking inside the loop re-copied every
    # previous row on each iteration (quadratic in the number of files).
    return np.vstack([header] + rows)
def func_to_get_desired_values(filename, returnValue = False):
    """
    This function does 3 simple things:
    - open the song file
    - get the elements we want and put them in a record
    - close the file
    INPUT :
    filename    - The name of the h5 file to be loaded
    returnValue - when True the entry is returned; otherwise it is
                  appended to the global all_desired_data and None is
                  returned
    """
    global all_desired_data
    # Open file
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        # Create and fill a record
        record = []
        for element in elementsRequested:
            result = getattr(GETTERS, element)(h5)
            try:
                if result == '':
                    result = 'Adlen - void'
            except Exception:
                # comparing an array with '' may not yield a plain bool
                pass
            try:
                # arrays collapse to their mean; arrays of length <= 1 to ''
                if isinstance(result, np.ndarray):
                    if len(result) > 1:
                        result = float(np.mean(result))
                    else:
                        result = ''
            except Exception:
                try:
                    result = float(result)
                except Exception:
                    # keep the raw value when it cannot be made numeric
                    pass
            record.append(result)

        song_id = GETTERS.get_track_id(h5)
        artist_name = GETTERS.get_artist_name(h5)
        title = GETTERS.get_title(h5)
        artist_mbtags = GETTERS.get_artist_mbtags(h5)
        release = GETTERS.get_release(h5)
    finally:
        # Close the file even if a getter raises.
        h5.close()

    song_id = unicode(song_id.decode('utf-8'))
    title = unicode(title.decode('utf-8'))
    artist_name = unicode(artist_name.decode('utf-8'))

    entry = [[[song_id, title, artist_name, elementsRequested],
              artist_name, title, artist_mbtags, release], record]
    if returnValue:
        return entry
    all_desired_data.append(entry)
def insert_song():
    """Insert one song row per dataset entry and cache it in `songs`.

    For every index below hard.NUM_SONGS the song fields are read from the
    module-level `h5`, completed with generated price / provider / genre
    values, inserted through sql.INSERT_SONG and stored as a Song in
    `songs`.

    Returns 0 on success.  On any exception the error and the most recent
    field values are printed and -1 is returned.  In both cases the
    connection is committed and closed afterwards.
    """
    print('Inserting song tuples')
    conn = get_conn()
    cursor = get_cursor(conn)

    # Pre-declared so the failure report below can always print them.
    song_id = title = avg_rate = release_date = duration = None
    price = provider_name = genre_id = singer_id = download = None

    try:
        for i in range(hard.NUM_SONGS):
            song_id = bytes2str(GETTERS.get_song_id(h5, i))
            title = bytes2str(GETTERS.get_title(h5, i))
            avg_rate = 0.0
            # int() turns numpy.int32 into a plain int the driver accepts
            release_date = int(GETTERS.get_year(h5, i))
            if release_date == 0:
                release_date = None
            duration = int(GETTERS.get_duration(h5, i))
            price = InfoGenerator.gen_price()
            provider_name = InfoGenerator.get_provider_name()
            genre_id = InfoGenerator.get_genre_id()
            singer_id = bytes2str(GETTERS.get_artist_id(h5, i))
            download = 0

            cursor.execute(
                sql.INSERT_SONG,
                id=song_id,
                title=title,
                avg_rate=avg_rate,
                release_date=release_date,
                duration=duration,
                price=price,
                provider_name=provider_name,
                genre_id=genre_id,
                singer_id=singer_id,
                download=download)
            songs[i] = Song(song_id, title, avg_rate, release_date,
                            duration, price, provider_name, genre_id,
                            singer_id, download)
    except Exception as e:
        print(e)
        print('i:', i,
              '\nid:', song_id,
              '\ntitle:', title,
              '\navg_rate:', avg_rate,
              '\nrelease_date:', release_date,
              '\nduration', duration,
              '\nprice', price,
              'provider_name:', provider_name,
              '\ngenre_id:', genre_id,
              '\nsinger_id', singer_id,
              'download:', download)
        return -1
    else:
        return 0
    finally:
        conn.commit()
        close_all(conn, cursor)
def MSD_sample_dirlist_save(path,file_path):
    """ get list of [directory, filename, artist, song title] for every
    entry of an MSD sample directory, append it to the csv at `file_path`
    and return it """
    import csv
    dirpath = path
    dirdata = []
    for fname in os.listdir(path):
        # os.path.join also works when `path` lacks a trailing separator.
        h5 = GETTERS.open_h5_file_read(os.path.join(dirpath, fname))
        try:
            dirdata.append([dirpath, fname,
                            GETTERS.get_artist_name(h5),
                            GETTERS.get_title(h5)])
        finally:
            # close the song file even if a getter raises
            h5.close()
    # `with` closes (and therefore flushes) the csv file; the handle used
    # to be left open.
    with open(file_path, 'a') as csv_file:
        listwriter = csv.writer(csv_file, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        listwriter.writerows(dirdata)
    return dirdata
def songinfo(if_str):
    """Return '<artist> - <title> (<year>)' for the song keyed by `if_str`.

    The key is mapped to a track id through the pickled songs_tracks
    table; characters 2, 3 and 4 of the track id select the
    sub-directories of the song file.
    """
    # NOTE: pickle.load can execute arbitrary code; only use this with a
    # songs_tracks.pkl that comes from a trusted source.
    # Binary mode is what pickle needs, and `with` closes the file.
    with open("../../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as pkl:
        songs_tracks = pickle.load(pkl)
    track = str(songs_tracks[if_str])
    # build path
    path = ("../../msd_dense_subset/dense/" + track[2] + "/" + track[3] +
            "/" + track[4] + "/" + track + ".h5")
    h5 = GETTERS.open_h5_file_read(path)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        song_name = GETTERS.get_title(h5)
        year = GETTERS.get_year(h5, 0)
    finally:
        # close the song file even if a getter raises
        h5.close()
    return artist_name + " - " + song_name + " (" + str(year) + ")"
def get_all_titles(basedir, ext='.h5', max_titles=1000):
    """Return song titles found under `basedir`, up to `max_titles`.

    Progress is printed every 100 files.  Collection stops as soon as
    `max_titles` titles were read (1000 by default, the former hard-coded
    limit); pass None to read every file.
    """
    titles = []
    count = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append(hdf5_getters.get_title(h5))
            finally:
                # close the file even if the getter raises
                h5.close()
            count += 1
            if count % 100 == 0:
                print(count, )
            if count == max_titles:
                return titles
    return titles
def get_url(h5_file):
    """Return a preview URL for the song stored in ``h5_file``.

    The lower-cased track id is tried first; when that lookup yields an empty
    string, a text search on title and artist is performed and the preview of
    the closest match is returned.

    Args:
        h5_file: open h5 song file accepted by the ``GETTERS`` functions.

    Returns:
        The value returned by ``get_preview_from_trackid`` for the id lookup
        (when non-empty) or for the closest text-search match, or ``None``
        when the text search returns no result.
    """
    artist_name = GETTERS.get_artist_name(h5_file)
    track_name = GETTERS.get_title(h5_file)
    echo_nest_id = GETTERS.get_track_id(h5_file).lower()

    # The id is text (it has just been lower-cased), so comparing it with the
    # integer 0 is meaningless: always true on Python 2 and a TypeError on
    # Python 3.  Only attempt the lookup when an id is actually present.
    if echo_nest_id:
        preview = get_preview_from_trackid(echo_nest_id)
        if preview != '':
            return preview

    # Fall back to a text search on title and artist.
    res = get_trackid_from_text_search(track_name, artistname=artist_name)
    if len(res) > 0:
        closest_track = get_closest_track(res, track_name)
        return get_preview_from_trackid(closest_track['id'])
    return None
def process_song(h5_song_file):
    """Gather metadata and per-song statistics of one track into a dict.

    The result holds artist fields, the title, the artist terms as
    ``[term, frequency, weight]`` triples, variance and count of beat and
    section start times, variance and mean of each of the 12 columns returned
    by ``split_segments`` for pitches and timbre, plus duration, loudness,
    tempo, song id (under ``'_id'``) and year.

    NOTE(review): every getter is called on the free name ``h5``; the
    ``h5_song_file`` parameter is never read.  Confirm whether the parameter
    was meant to be used instead.
    """
    record = {
        'artist_familiarity': hdf5_getters.get_artist_familiarity(h5),
        'artist_id': hdf5_getters.get_artist_id(h5),
        'artist_name': hdf5_getters.get_artist_name(h5),
        'artist_hotttnesss': hdf5_getters.get_artist_hotttnesss(h5),
        'title': hdf5_getters.get_title(h5),
    }

    # One [term, frequency, weight] triple per term associated with the artist.
    term_names = hdf5_getters.get_artist_terms(h5)
    term_freqs = hdf5_getters.get_artist_terms_freq(h5)
    term_weights = hdf5_getters.get_artist_terms_weight(h5)
    record['artist_terms'] = [
        [term_names[k], term_freqs[k], term_weights[k]]
        for k in range(len(term_names))
    ]

    beat_starts = hdf5_getters.get_beats_start(h5)
    # The original note gives the unit of this variance as yoctoseconds
    # (10^-24 s) -- unverified.
    record['beats_start_variance'] = variance(beat_starts)
    record['number_of_beats'] = len(beat_starts)

    record['duration'] = hdf5_getters.get_duration(h5)
    record['loudness'] = hdf5_getters.get_loudness(h5)

    section_starts = hdf5_getters.get_sections_start(h5)
    record['sections_start_variance'] = variance(section_starts)
    record['number_of_sections'] = len(section_starts)

    # split_segments must hand back exactly 12 columns; the explicit unpacking
    # keeps that requirement.
    (p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) = split_segments(
        hdf5_getters.get_segments_pitches(h5))
    pitch_columns = (p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11)
    record['segments_pitches_variance'] = [variance(col) for col in pitch_columns]
    record['segments_pitches_mean'] = [mean(col) for col in pitch_columns]

    (t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) = split_segments(
        hdf5_getters.get_segments_timbre(h5))
    timbre_columns = (t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
    record['segments_timbre_variance'] = [variance(col) for col in timbre_columns]
    record['segments_timbre_mean'] = [mean(col) for col in timbre_columns]

    record['tempo'] = hdf5_getters.get_tempo(h5)
    record['_id'] = hdf5_getters.get_song_id(h5)
    record['year'] = hdf5_getters.get_year(h5)
    return record
def _read_raw_song(h5):
    """Return ``(year, timbre, pitch, ten_feature)`` for a usable song, else None.

    A song is usable when its year is not 0 and its timbre array has at least
    100 rows.
    """
    year = getter.get_year(h5)
    if year == 0:
        return None
    timbre = getter.get_segments_timbre(h5)
    if np.size(timbre, 0) < 100:
        return None
    pitch = getter.get_segments_pitches(h5)

    title_length = len(getter.get_title(h5))
    terms_length = len(getter.get_artist_terms(h5))
    tags_length = len(getter.get_artist_mbtags(h5))
    hotness = getter.get_artist_hotttnesss(h5)
    duration = getter.get_duration(h5)
    loudness = getter.get_loudness(h5)
    mode = getter.get_mode(h5)
    release_length = len(getter.get_release(h5))
    tempo = getter.get_tempo(h5)
    name_length = len(getter.get_artist_name(h5))
    ten_feature = np.hstack([title_length, hotness, duration, tags_length,
                             terms_length, loudness, mode, release_length,
                             tempo, name_length])
    return year, timbre, pitch, ten_feature


def load_raw_data():
    """Load year, timbre, pitch and ten summary features of every usable song.

    Walks the global ``basedir`` for files ending in the global ``ext`` and
    reads each with ``getter``.  Songs with year 0 or fewer than 100 timbre
    rows are skipped.  A song whose read raises is reported by printing ``1``
    and skipped as a whole, so the four returned lists always stay aligned.

    Returns:
        ``(years, timbres, pitches, min_length, ten_features)`` where
        ``min_length`` is the smallest timbre row count among the kept songs
        (10000 when none is smaller).
    """
    years = []
    ten_features = []
    timbres = []
    pitches = []
    min_length = 10000
    num = 0
    for root, _dirs, _files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                song = _read_raw_song(h5)
            except Exception:
                # Best effort: report and skip unreadable songs, but no longer
                # swallow KeyboardInterrupt / SystemExit.
                print(1)
                song = None
            finally:
                h5.close()
            if song is None:
                continue

            # Commit results only after every read succeeded; appending early
            # used to leave years/timbres/pitches longer than ten_features
            # when a later getter failed.
            year, timbre, pitch, ten_feature = song
            min_length = min(min_length, np.size(timbre, 0))
            years.append(year)
            timbres.append(timbre)
            pitches.append(pitch)
            ten_features.append(ten_feature)
    return years, timbres, pitches, min_length, ten_features
def h5_to_csv_fields(h5,song):
    """Render one song of an h5 file as a single csv line.

    Inputs:
        h5, an h5 file object usable with the MSongsDB wrapper code
        song, an integer selecting which song of the h5 file to export
            (an h5 file can contain many songs)
    Output:
        a string holding all exported fields joined with commas and ending
        with a newline
    """
    def scalar(getter):
        # Value used as returned by the wrapper getter.
        return getter(h5, song)

    def flat(getter):
        # Array-valued getter, turned into one field by array_to_csv_field
        # with its default separator ('_' according to the original notes).
        return array_to_csv_field(list(getter(h5, song)))

    def nested(getter):
        # Array of vectors: [[1,2,3],[4,5,6]] is written as '1;2;3_4;5;6',
        # i.e. two kinds of separators are used.
        return double_Array_to_csv_field(list(getter(h5, song)), '_', ';')

    # (renderer, getter) pairs in output column order.
    layout = (
        (scalar, gt.get_artist_name),
        (scalar, gt.get_title),
        (scalar, gt.get_release),
        (scalar, gt.get_year),
        (scalar, gt.get_duration),
        (scalar, gt.get_artist_familiarity),
        (scalar, gt.get_artist_hotttnesss),
        (scalar, gt.get_song_hotttnesss),
        (flat, gt.get_artist_terms),
        (flat, gt.get_artist_terms_freq),
        (flat, gt.get_artist_terms_weight),
        (scalar, gt.get_mode),
        (scalar, gt.get_key),
        (scalar, gt.get_tempo),
        (scalar, gt.get_loudness),
        (scalar, gt.get_danceability),
        (scalar, gt.get_energy),
        (scalar, gt.get_time_signature),
        (flat, gt.get_segments_start),
        (nested, gt.get_segments_timbre),
        (nested, gt.get_segments_pitches),
        (flat, gt.get_segments_loudness_start),
        (flat, gt.get_segments_loudness_max),
        (flat, gt.get_segments_loudness_max_time),
        (flat, gt.get_sections_start),
    )
    fields = [render(getter) for render, getter in layout]

    # Join the fields with commas to obtain the csv line.
    return array_to_csv_field(fields, ",") + "\n"
def get_all_data(target, basedir, ext='.h5'):
    """Write one tab-separated line per song found below ``basedir``.

    A header line with the 14 column names is written first.  Every file
    ending in ``ext`` is then opened with ``hdf5_getters`` and its values are
    written to ``target`` in header order.  A progress counter is printed
    after each file.

    Args:
        target: writable text file object receiving the table.
        basedir: root directory that is walked recursively.
        ext: extension of the song files.
    """
    columns = (
        "track_id", "song_id", "title", "artist_name", "artist_location",
        "artist_hotttnesss", "release", "year", "song_hotttnesss",
        "danceability", "duration", "loudness", "sample_rate", "tempo",
    )
    row_format = "\t".join(["%s"] * len(columns)) + "\n"

    # header
    target.write(row_format % columns)

    count = 0
    for root, _dirs, _files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            # Open the matched path itself.  The previous code looped over the
            # characters of the file name, wrote each to tmp.txt and passed
            # that file object to the HDF5 opener, which cannot work.
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                target.write(row_format % (
                    hdf5_getters.get_track_id(h5),
                    hdf5_getters.get_song_id(h5),
                    hdf5_getters.get_title(h5),
                    hdf5_getters.get_artist_name(h5),
                    hdf5_getters.get_artist_location(h5),
                    hdf5_getters.get_artist_hotttnesss(h5),
                    hdf5_getters.get_release(h5),
                    hdf5_getters.get_year(h5),
                    hdf5_getters.get_song_hotttnesss(h5),
                    hdf5_getters.get_danceability(h5),
                    hdf5_getters.get_duration(h5),
                    hdf5_getters.get_loudness(h5),
                    hdf5_getters.get_analysis_sample_rate(h5),
                    hdf5_getters.get_tempo(h5),
                ))
            finally:
                h5.close()
            # show progress (single-argument form prints the same text on
            # Python 2 and Python 3)
            count += 1
            print("%d/10000" % count)
def get_all_attributes(filename):
    """
    This function does 4 simple things:
    - open the song file
    - get all required attributes
    - append them as one tab-separated row to ``attributes.csv``
    - close the files

    A song whose processing raises ``AttributeError`` is skipped silently and
    contributes no row.
    """
    with open('attributes.csv', 'a') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        try:
            h5 = GETTERS.open_h5_file_read(filename)
            try:
                csvwriter.writerow([
                    GETTERS.get_year(h5),
                    GETTERS.get_artist_id(h5),
                    GETTERS.get_artist_name(h5),
                    GETTERS.get_artist_mbid(h5),
                    convert_terms(GETTERS.get_artist_terms(h5)),
                    GETTERS.get_artist_hotttnesss(h5),
                    GETTERS.get_artist_latitude(h5),
                    GETTERS.get_artist_longitude(h5),
                    GETTERS.get_artist_familiarity(h5),
                    GETTERS.get_danceability(h5),
                    GETTERS.get_duration(h5),
                    GETTERS.get_energy(h5),
                    GETTERS.get_loudness(h5),
                    GETTERS.get_song_hotttnesss(h5),
                    GETTERS.get_song_id(h5),
                    GETTERS.get_tempo(h5),
                    GETTERS.get_time_signature(h5),
                    GETTERS.get_title(h5),
                    GETTERS.get_track_id(h5),
                    GETTERS.get_release(h5),
                ])
            finally:
                # Previously the handle stayed open whenever the
                # AttributeError below was triggered part-way through.
                h5.close()
        except AttributeError:
            # Kept from the original behaviour: such songs are skipped
            # without any report.
            pass
def getURLFromH5(h5path):
    """Print and return the preview URL of the song stored at ``h5path``.

    The 7digital track id is read from the file and, when it is not negative,
    passed to ``get_preview_from_trackid``.

    Args:
        h5path: path of the h5 song file.

    Returns:
        The preview URL, or ``None`` when the track id is negative or the
        lookup returned an empty string.

    Exits the process with status 0 when ``h5path`` is not an existing file.
    """
    if not os.path.isfile(h5path):
        print('invalid path (not a file): %s' % (h5path,))
        sys.exit(0)

    h5 = hdf5_utils.open_h5_file_read(h5path)
    try:
        # Only the track id is needed; the release/artist ids and names that
        # used to be read here were never used.
        track_7digitalid = GETTERS.get_track_7digitalid(h5)
    finally:
        h5.close()

    # we already have the 7digital track id? way too easy!
    print("Suggested Song URLs For you")
    print("===========================")
    if track_7digitalid >= 0:
        preview = get_preview_from_trackid(track_7digitalid)
        if preview == '':
            print('something went wrong when looking by track id')
        else:
            print(preview)
            return preview
    return None
def func_to_extract_features(filename):
    """Extract the feature row of one song and record it in the global stores.

    The row contains, in order: song hotness, loudness, key, duration, the 12
    column means of the segment pitches (as one list), the 12 column means of
    the segment timbre (as one list), year, tempo, maximum segment loudness
    minus minimum segment start loudness, artist familiarity, title, artist
    name and genre.  The genre is the artist MusicBrainz tag with the highest
    count, or ``"Unknown"`` when there is no tag.

    A NaN hotness/loudness/key/duration or a year of 0 marks the song as
    incomplete; each such field increments the global ``cntnan``.  Only
    complete songs are appended (through ``list_to_csv``) to the global
    ``listfeatures`` and have their hotness added to ``mydict`` under the
    artist name.

    Args:
        filename: path of the h5 song file.
    """
    global cntnan

    h5 = GETTERS.open_h5_file_read(filename)
    try:
        cf = []
        nanfound = False

        # Features 0-3: hotness (the target), loudness, key, duration.
        song_hotness = GETTERS.get_song_hotttnesss(h5)
        leading = (
            song_hotness,
            GETTERS.get_loudness(h5),
            GETTERS.get_key(h5),
            GETTERS.get_duration(h5),
        )
        for value in leading:
            if math.isnan(value):
                nanfound = True
                cntnan = cntnan + 1
            else:
                cf.append(value)

        # Features 4-15 and 16-27: mean of each of the 12 pitch / timbre
        # columns across all segments.  A plain ndarray mean replaces np.mat,
        # which no longer exists in NumPy 2.0; atleast_2d keeps the row-vector
        # treatment np.mat gave to 1-D input.
        for getter in (GETTERS.get_segments_pitches,
                       GETTERS.get_segments_timbre):
            column_means = np.atleast_2d(getter(h5)).mean(axis=0)
            cf.append([column_means[i] for i in range(12)])

        # Feature 28: year, where 0 counts as missing.
        song_year = GETTERS.get_year(h5)
        if song_year == 0:
            nanfound = True
            cntnan = cntnan + 1
        else:
            cf.append(song_year)

        # Feature 29: tempo.
        cf.append(GETTERS.get_tempo(h5))

        # Feature 30: spread between the loudest segment maximum and the
        # quietest segment start.
        max_loudness_arr = GETTERS.get_segments_loudness_max(h5)
        start_loudness_arr = GETTERS.get_segments_loudness_start(h5)
        if not nanfound:
            cf.append(max(max_loudness_arr) - min(start_loudness_arr))

        # Features 31-33: artist familiarity, title, artist name.
        cf.append(GETTERS.get_artist_familiarity(h5))
        cf.append(GETTERS.get_title(h5))
        artist_name = GETTERS.get_artist_name(h5)
        cf.append(artist_name)

        # Genre: the MusicBrainz tag with the highest count.
        artist_mbtags = GETTERS.get_artist_mbtags(h5)
        if not artist_mbtags.size:
            genre = "Unknown"
        else:
            artist_mbcount = np.array(GETTERS.get_artist_mbtags_count(h5))
            genre = artist_mbtags[artist_mbcount.argmax(axis=0)]
            if genre == 'espa\xc3\xb1ol':
                genre = "Unknown"
        cf.append(genre)

        if not nanfound:
            listfeatures.append(list_to_csv(cf))
            mydict.setdefault(artist_name, []).append(song_hotness)
    finally:
        # Close the handle even when a getter or a computation raises.
        h5.close()
def data_to_flat_file(basedir,ext='.h5') : """ This function extracts the information from the tables and creates the flat file. """ count = 0; #song counter list_to_write= [] group_index=0 row_to_write = "" writer = csv.writer(open("complete.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: row=[] print f h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') row.append(title) comma=title.find(',') if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') row.append(album) comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') row.append(artist_name) duration = hdf5_getters.get_duration(h5) row.append(duration) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) row.append(samp_rt) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) row.append(artist_7digitalid) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 row.append(artist_fam) artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 row.append(artist_hotness) artist_id = hdf5_getters.get_artist_id(h5) row.append(artist_id) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 row.append(artist_lat) artist_loc = hdf5_getters.get_artist_location(h5) row.append(artist_loc) artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 row.append(artist_lon) 
artist_mbid = hdf5_getters.get_artist_mbid(h5) row.append(artist_mbid) #Getting the genre art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq genre_set=0 #flag to see if the genre has been set or not final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 if genre_set == 1: col_num=[] for i in final_genre: column=int(i) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) else: genre_array=genre_columns(-1) #when there is no genre matched, return an array of [0...0] for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) artist_pmid = hdf5_getters.get_artist_playmeid(h5) row.append(artist_pmid) audio_md5 = hdf5_getters.get_audio_md5(h5) row.append(audio_md5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 row.append(danceability) end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 row.append(end_fade_in) energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 row.append(energy) song_key = hdf5_getters.get_key(h5) row.append(song_key) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a 
"nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 row.append(key_c) loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 row.append(loudness) mode = hdf5_getters.get_mode(h5) row.append(mode) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 row.append(mode_conf) release_7digitalid = hdf5_getters.get_release_7digitalid(h5) row.append(release_7digitalid) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 row.append(song_hot) song_id = hdf5_getters.get_song_id(h5) row.append(song_id) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) row.append(start_fade_out) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 row.append(tempo) time_sig = hdf5_getters.get_time_signature(h5) row.append(time_sig) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 row.append(time_sig_c) track_id = hdf5_getters.get_track_id(h5) row.append(track_id) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) row.append(track_7digitalid) year = hdf5_getters.get_year(h5) row.append(year) bars_c = hdf5_getters.get_bars_confidence(h5) bars_start = hdf5_getters.get_bars_start(h5) row_bars_padding=padding(245) #this is the array that will be attached at the end of th row #--------------bars---------------" gral_info=[] gral_info=row[:] empty=[] for i,item in enumerate(bars_c): row.append(group_index) row.append(i) row.append(bars_c[i]) bars_c_avg= get_avg(bars_c) row.append(bars_c_avg) bars_c_max= get_max(bars_c) row.append(bars_c_max) bars_c_min = get_min(bars_c) row.append(bars_c_min) 
bars_c_stddev= get_stddev(bars_c) row.append(bars_c_stddev) bars_c_count = get_count(bars_c) row.append(bars_c_count) bars_c_sum = get_sum(bars_c) row.append(bars_c_sum) row.append(bars_start[i]) bars_start_avg = get_avg(bars_start) row.append(bars_start_avg) bars_start_max= get_max(bars_start) row.append(bars_start_max) bars_start_min = get_min(bars_start) row.append(bars_start_min) bars_start_stddev= get_stddev(bars_start) row.append(bars_start_stddev) bars_start_count = get_count(bars_start) row.append(bars_start_count) bars_start_sum = get_sum(bars_start) row.append(bars_start_sum) for i in row_bars_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------beats---------------" beats_c = hdf5_getters.get_beats_confidence(h5) group_index=1 row=[] row=gral_info[:] row_front=padding(14) #blanks left in front of the row(empty spaces for bars) row_beats_padding=padding(231) for i,item in enumerate(beats_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the beats row.append(index) row.append(beats_c[i]) beats_c_avg= get_avg(beats_c) row.append(beats_c_avg) beats_c_max= get_max(beats_c) row.append(beats_c_max) beats_c_min = get_min(beats_c) row.append(beats_c_min) beats_c_stddev= get_stddev(beats_c) row.append(beats_c_stddev) beats_c_count = get_count(beats_c) row.append(beats_c_count) beats_c_sum = get_sum(beats_c) row.append(beats_c_sum) beats_start = hdf5_getters.get_beats_start(h5) row.append(beats_start[i]) beats_start_avg = get_avg(beats_start) row.append(beats_start_avg) beats_start_max= get_max(beats_start) row.append(beats_start_max) beats_start_min = get_min(beats_start) row.append(beats_start_min) beats_start_stddev= get_stddev(beats_start) row.append(beats_start_stddev) beats_start_count = get_count(beats_start) row.append(beats_start_count) beats_start_sum = get_sum(beats_start) row.append(beats_start_sum) for i in row_beats_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] # 
"--------sections---------------" row_sec_padding=padding(217) #blank spaces left at the end of the row sec_c = hdf5_getters.get_sections_confidence(h5) group_index=2 row=[] row=gral_info[:] row_front=padding(28) #blank spaces left in front(empty spaces for bars,beats) for i,item in enumerate(sec_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the sections row.append(index) row.append(sec_c[i]) sec_c_avg= get_avg(sec_c) row.append(sec_c_avg) sec_c_max= get_max(sec_c) row.append(sec_c_max) sec_c_min = get_min(sec_c) row.append(sec_c_min) sec_c_stddev= get_stddev(sec_c) row.append(sec_c_stddev) sec_c_count = get_count(sec_c) row.append(sec_c_count) sec_c_sum = get_sum(sec_c) row.append(sec_c_sum) sec_start = hdf5_getters.get_sections_start(h5) row.append(sec_start[i]) sec_start_avg = get_avg(sec_start) row.append(sec_start_avg) sec_start_max= get_max(sec_start) row.append(sec_start_max) sec_start_min = get_min(sec_start) row.append(sec_start_min) sec_start_stddev= get_stddev(sec_start) row.append(sec_start_stddev) sec_start_count = get_count(sec_start) row.append(sec_start_count) sec_start_sum = get_sum(sec_start) row.append(sec_start_sum) for i in row_sec_padding: #appending the blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------segments---------------" row_seg_padding=padding(182) #blank spaces at the end of the row row_front=padding(42) #blank spaces left in front of segments seg_c = hdf5_getters.get_segments_confidence(h5) group_index=3 row=[] row=gral_info[:] for i,item in enumerate(seg_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the segments row.append(index) row.append(seg_c[i]) seg_c_avg= get_avg(seg_c) row.append(seg_c_avg) seg_c_max= get_max(seg_c) row.append(seg_c_max) seg_c_min = get_min(seg_c) row.append(seg_c_min) seg_c_stddev= get_stddev(seg_c) row.append(seg_c_stddev) seg_c_count = get_count(seg_c) 
row.append(seg_c_count) seg_c_sum = get_sum(seg_c) row.append(seg_c_sum) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) row.append(seg_loud_max[i]) seg_loud_max_avg= get_avg(seg_loud_max) row.append(seg_loud_max_avg) seg_loud_max_max= get_max(seg_loud_max) row.append(seg_loud_max_max) seg_loud_max_min = get_min(seg_loud_max) row.append(seg_loud_max_min) seg_loud_max_stddev= get_stddev(seg_loud_max) row.append(seg_loud_max_stddev) seg_loud_max_count = get_count(seg_loud_max) row.append(seg_loud_max_count) seg_loud_max_sum = get_sum(seg_loud_max) row.append(seg_loud_max_sum) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) row.append(seg_loud_max_time[i]) seg_loud_max_time_avg= get_avg(seg_loud_max_time) row.append(seg_loud_max_time_avg) seg_loud_max_time_max= get_max(seg_loud_max_time) row.append(seg_loud_max_time_max) seg_loud_max_time_min = get_min(seg_loud_max_time) row.append(seg_loud_max_time_min) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) row.append(seg_loud_max_time_stddev) seg_loud_max_time_count = get_count(seg_loud_max_time) row.append(seg_loud_max_time_count) seg_loud_max_time_sum = get_sum(seg_loud_max_time) row.append(seg_loud_max_time_sum) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) row.append(seg_loud_start[i]) seg_loud_start_avg= get_avg(seg_loud_start) row.append(seg_loud_start_avg) seg_loud_start_max= get_max(seg_loud_start) row.append(seg_loud_start_max) seg_loud_start_min = get_min(seg_loud_start) row.append(seg_loud_start_min) seg_loud_start_stddev= get_stddev(seg_loud_start) row.append(seg_loud_start_stddev) seg_loud_start_count = get_count(seg_loud_start) row.append(seg_loud_start_count) seg_loud_start_sum = get_sum(seg_loud_start) row.append(seg_loud_start_sum) seg_start = hdf5_getters.get_segments_start(h5) row.append(seg_start[i]) seg_start_avg= get_avg(seg_start) row.append(seg_start_avg) seg_start_max= get_max(seg_start) row.append(seg_start_max) seg_start_min = 
get_min(seg_start) row.append(seg_start_min) seg_start_stddev= get_stddev(seg_start) row.append(seg_start_stddev) seg_start_count = get_count(seg_start) row.append(seg_start_count) seg_start_sum = get_sum(seg_start) row.append(seg_start_sum) for i in row_seg_padding: #appending blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #----------segments pitch and timbre---------------" row_seg2_padding=padding(14) #blank spaces left at the end of the row row_front=padding(77) #blank spaces left at the front of the segments and timbre seg_pitch = hdf5_getters.get_segments_pitches(h5) transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows group_index=4 row=[] row=gral_info[:] for i,item in enumerate(transpose_pitch[0]): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of segments and timbre row.append(index) row.append(transpose_pitch[0][i]) seg_pitch_avg= get_avg(transpose_pitch[0]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[0]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[0]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[0]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[0]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[0]) row.append(seg_pitch_sum) row.append(transpose_pitch[1][i]) seg_pitch_avg= get_avg(transpose_pitch[1]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[1]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[1]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[1]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[1]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[1]) row.append(seg_pitch_sum) row.append(transpose_pitch[2][i]) seg_pitch_avg= get_avg(transpose_pitch[2]) row.append(seg_pitch_avg) seg_pitch_max= 
get_max(transpose_pitch[2]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[2]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[2]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[2]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[2]) row.append(seg_pitch_sum) row.append(transpose_pitch[3][i]) seg_pitch_avg= get_avg(transpose_pitch[3]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[3]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[3]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[3]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[3]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[3]) row.append(seg_pitch_sum) row.append(transpose_pitch[4][i]) seg_pitch_avg= get_avg(transpose_pitch[4]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[4]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[4]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[4]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[4]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[4]) row.append(seg_pitch_sum) row.append(transpose_pitch[5][i]) seg_pitch_avg= get_avg(transpose_pitch[5]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[5]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[5]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[5]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[5]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[5]) row.append(seg_pitch_sum) row.append(transpose_pitch[6][i]) seg_pitch_avg= get_avg(transpose_pitch[6]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[6]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[6]) row.append(seg_pitch_min) 
seg_pitch_stddev= get_stddev(transpose_pitch[6]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[6]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[6]) row.append(seg_pitch_sum) row.append(transpose_pitch[7][i]) seg_pitch_avg= get_avg(transpose_pitch[7]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[7]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[7]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[7]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[7]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[7]) row.append(seg_pitch_sum) row.append(transpose_pitch[8][i]) seg_pitch_avg= get_avg(transpose_pitch[8]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[8]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[8]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[8]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[8]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[8]) row.append(seg_pitch_sum) row.append(transpose_pitch[9][i]) seg_pitch_avg= get_avg(transpose_pitch[9]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[9]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[9]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[9]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[9]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[9]) row.append(seg_pitch_sum) row.append(transpose_pitch[10][i]) seg_pitch_avg= get_avg(transpose_pitch[10]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[10]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[10]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[10]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[10]) 
row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[10]) row.append(seg_pitch_sum) row.append(transpose_pitch[11][i]) seg_pitch_avg= get_avg(transpose_pitch[11]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[11]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[11]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[11]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[11]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[11]) row.append(seg_pitch_sum) #timbre arrays seg_timbre = hdf5_getters.get_segments_timbre(h5) transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows row.append(transpose_timbre[0][i]) seg_timbre_avg= get_avg(transpose_timbre[0]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[0]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[0]) row.append(seg_timbre_min) seg_timbre_stddev=get_stddev(transpose_timbre[0]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[0]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[0]) row.append(seg_timbre_sum) row.append(transpose_timbre[1][i]) seg_timbre_avg= get_avg(transpose_timbre[1]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[1]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[1]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[1]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[1]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[1]) row.append(seg_timbre_sum) row.append(transpose_timbre[2][i]) seg_timbre_avg= get_avg(transpose_timbre[2]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[2]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[2]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[2]) 
row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[2]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[2]) row.append(seg_timbre_sum) row.append(transpose_timbre[3][i]) seg_timbre_avg= get_avg(transpose_timbre[3]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[3]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[3]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[3]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[3]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[3]) row.append(seg_timbre_sum) row.append(transpose_timbre[4][i]) seg_timbre_avg= get_avg(transpose_timbre[4]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[4]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[4]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[4]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[4]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[4]) row.append(seg_timbre_sum) row.append(transpose_timbre[5][i]) seg_timbre_avg= get_avg(transpose_timbre[5]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[5]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[5]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[5]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[5]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[5]) row.append(seg_timbre_sum) row.append(transpose_timbre[6][i]) seg_timbre_avg= get_avg(transpose_timbre[6]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[6]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[6]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[6]) row.append(seg_timbre_stddev) seg_timbre_count = 
get_count(transpose_timbre[6]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[6]) row.append(seg_timbre_sum) row.append(transpose_timbre[7][i]) seg_timbre_avg= get_avg(transpose_timbre[7]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[7]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[7]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[7]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[7]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[7]) row.append(seg_timbre_sum) row.append(transpose_timbre[8][i]) seg_timbre_avg= get_avg(transpose_timbre[8]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[8]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[8]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[8]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[8]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[8]) row.append(seg_timbre_sum) row.append(transpose_timbre[9][i]) seg_timbre_avg= get_avg(transpose_timbre[9]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[9]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[9]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[9]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[9]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[9]) row.append(seg_timbre_sum) row.append(transpose_timbre[10][i]) seg_timbre_avg= get_avg(transpose_timbre[10]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[10]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[10]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[10]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[10]) 
row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[10]) row.append(seg_timbre_sum) row.append(transpose_timbre[11][i]) seg_timbre_avg= get_avg(transpose_timbre[11]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[11]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[11]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[11]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[11]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[11]) row.append(seg_timbre_sum) for item in row_seg2_padding: row.append(item) writer.writerow(row) row=[] row=gral_info[:] # "--------tatums---------------" tatms_c = hdf5_getters.get_tatums_confidence(h5) group_index=5 row_front=padding(245) #blank spaces left in front of tatums row=[] row=gral_info[:] for i,item in enumerate(tatms_c): row.append(group_index) row.append(i) for item in row_front: #appending blank spaces at the front of the row row.append(item) row.append(tatms_c[i]) tatms_c_avg= get_avg(tatms_c) row.append(tatms_c_avg) tatms_c_max= get_max(tatms_c) row.append(tatms_c_max) tatms_c_min = get_min(tatms_c) row.append(tatms_c_min) tatms_c_stddev= get_stddev(tatms_c) row.append(tatms_c_stddev) tatms_c_count = get_count(tatms_c) row.append(tatms_c_count) tatms_c_sum = get_sum(tatms_c) row.append(tatms_c_sum) tatms_start = hdf5_getters.get_tatums_start(h5) row.append(tatms_start[i]) tatms_start_avg= get_avg(tatms_start) row.append(tatms_start_avg) tatms_start_max= get_max(tatms_start) row.append(tatms_start_max) tatms_start_min = get_min(tatms_start) row.append(tatms_start_min) tatms_start_stddev= get_stddev(tatms_start) row.append(tatms_start_stddev) tatms_start_count = get_count(tatms_start) row.append(tatms_start_count) tatms_start_sum = get_sum(tatms_start) row.append(tatms_start_sum) writer.writerow(row) row=[] row=gral_info[:] transpose_pitch= seg_pitch.transpose() #this is to tranpose the 
matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 h5.close() count=count+1; print count;
for term in terms: term = term.replace("'","") cursor.execute("SELECT * FROM artist_genres WHERE artist_id='" + artist_id + "' AND genre ='" + term + "'") if cursor.rowcount != 1: cursor.execute("INSERT INTO artist_genres VALUES ('" + artist_id + "','" + term + "')") for tag in mbtags: tag = tag.replace("'","") cursor.execute("SELECT * FROM artist_genres WHERE artist_id='" + artist_id + "' AND genre ='" + tag + "'") if cursor.rowcount != 1: cursor.execute("INSERT INTO artist_genres VALUES ('" + artist_id + "','" + tag + "')") ''' Store track tuples ''' track_id = h.get_track_id(h5,0) track_title = h.get_title(h5,0) track_title = track_title.replace("'","") track_album = h.get_release(h5,0) track_album = track_album.replace("'","") track_duration = str(h.get_duration(h5,0)) track_year = str(h.get_year(h5,0)) cursor.execute("SELECT * FROM track WHERE track_id = '" + track_id + "'") rs = cursor.fetchall() if cursor.rowcount != 1: cursor.execute("INSERT INTO track VALUES ('" + track_id + "','" + track_title + "','" + artist_id + "','" + artist_name + "','" + track_album + "'," + track_duration + "," + track_year + ");") ''' Store track_analysis tuples ''' print ("Track ID: " + h.get_track_id(h5,0)) track_tempo = str(h.get_tempo(h5,0)) track_key = str(h.get_key(h5,0))
def classify(h5):
    """Build a dictionary of summary features for one song.

    Every value is read from ``h5`` through ``hdf5_getters``.  For bars,
    beats, sections, segments and tatums the function stores the number of
    events and the variance of the gaps between consecutive start times.
    Per-segment loudness, pitches and timbre are reduced to means and
    variances, and the remaining scalar fields are copied as they are.

    Args:
        h5: song handle accepted by the ``hdf5_getters`` functions
            (presumably an open HDF5 song file -- confirm with the caller).

    Returns:
        dict mapping feature name to value.  Pitch/timbre statistics and the
        artist terms are plain lists; ``key``, ``mode``, ``time_signature``
        and ``year`` are converted with ``int``.

    Raises:
        IndexError: if the song has no segments, because the loudest
            segment is looked up by index.
    """
    features = {}

    duration = hdf5_getters.get_duration(h5)
    features["duration"] = duration

    # bars: count and variance of the bar lengths
    bars = hdf5_getters.get_bars_start(h5)
    features["num_bars"] = len(bars)
    features["variance_bar_length"] = numpy.var(numpy.ediff1d(bars))

    # beats: the variance is taken over the beat lengths, not the bar lengths
    beats = hdf5_getters.get_beats_start(h5)
    features["num_beats"] = len(beats)
    features["variance_beats_length"] = numpy.var(numpy.ediff1d(beats))

    features["danceability"] = hdf5_getters.get_danceability(h5)
    features["end_of_fade_in"] = hdf5_getters.get_end_of_fade_in(h5)
    features["energy"] = hdf5_getters.get_energy(h5)
    features["key"] = int(hdf5_getters.get_key(h5))
    features["loudness"] = hdf5_getters.get_loudness(h5)
    features["mode"] = int(hdf5_getters.get_mode(h5))

    # sections: variance of the section lengths (gaps between start times),
    # not of the start times themselves
    sections = hdf5_getters.get_sections_start(h5)
    features["num_sections"] = len(sections)
    features["variance_sections_length"] = numpy.var(numpy.ediff1d(sections))

    # segments: same treatment as sections
    segments = hdf5_getters.get_segments_start(h5)
    features["num_segments"] = len(segments)
    features["variance_segments_length"] = numpy.var(numpy.ediff1d(segments))

    # loudest segment: first index holding the maximum, plus its time value
    loudness_max = hdf5_getters.get_segments_loudness_max(h5)
    loudness_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
    loudest = 0
    for index, value in enumerate(loudness_max):
        if value > loudness_max[loudest]:
            loudest = index
    features["segment_loudness_max"] = loudness_max[loudest]
    features["segment_loudness_time"] = loudness_max_time[loudest]

    # POSSIBLE TODO: use average function instead and weight by segment length
    loudness_start = hdf5_getters.get_segments_loudness_start(h5)
    features["segment_loudness_mean"] = numpy.mean(loudness_start)
    features["segment_loudness_variance"] = numpy.var(loudness_start)

    # pitches and timbre: statistics along axis 0, one value per column
    pitches = hdf5_getters.get_segments_pitches(h5)
    features["segment_pitches_mean"] = numpy.mean(pitches, axis=0).tolist()
    features["segment_pitches_variance"] = numpy.var(pitches, axis=0).tolist()

    timbres = hdf5_getters.get_segments_timbre(h5)
    features["segment_timbres_mean"] = numpy.mean(timbres, axis=0).tolist()
    features["segment_timbres_variance"] = numpy.var(timbres, axis=0).tolist()

    features["hottness"] = hdf5_getters.get_song_hotttnesss(h5, 0)

    # time between the start of the fade out and the end of the song
    features["fade_out"] = duration - hdf5_getters.get_start_of_fade_out(h5)

    # tatums: count and variance of the tatum lengths
    tatums = hdf5_getters.get_tatums_start(h5)
    features["num_tatums"] = len(tatums)
    features["variance_tatums_length"] = numpy.var(numpy.ediff1d(tatums))

    features["tempo"] = hdf5_getters.get_tempo(h5)
    features["time_signature"] = int(hdf5_getters.get_time_signature(h5))
    features["year"] = int(hdf5_getters.get_year(h5))

    # artist and title metadata
    features["artist_terms"] = hdf5_getters.get_artist_terms(h5, 0).tolist()
    features["artist_terms_freq"] = hdf5_getters.get_artist_terms_freq(h5, 0).tolist()
    features["artist_name"] = hdf5_getters.get_artist_name(h5, 0)
    features["artist_id"] = hdf5_getters.get_artist_id(h5, 0)
    features["title"] = hdf5_getters.get_title(h5, 0)

    return features
print 'Pass it as a flag: -7digitalkey KEY' print 'or set it under environment variable: DIGITAL7_API_KEY' sys.exit(0) if not os.path.isfile(h5path): print 'invalid path (not a file):',h5path sys.exit(0) # open h5 song, get all we know about the song h5 = hdf5_utils.open_h5_file_read(h5path) track_7digitalid = GETTERS.get_track_7digitalid(h5) release_7digitalid = GETTERS.get_release_7digitalid(h5) artist_7digitalid = GETTERS.get_artist_7digitalid(h5) artist_name = GETTERS.get_artist_name(h5) release_name = GETTERS.get_release(h5) track_name = GETTERS.get_title(h5) h5.close() # we already have the 7digital track id? way too easy! if track_7digitalid >= 0: preview = get_preview_from_trackid(track_7digitalid) if preview == '': print 'something went wrong when looking by track id' else: print preview sys.exit(0) # we have the release id? get all tracks, find the closest match if release_7digitalid >= 0: tracks_name_ids = get_tracks_from_releaseid(release_7digitalid) if tracks_name_ids is None:
track = {} #Handle each one year = h5get.get_year(h5, i) if year < 1980 or year > 2010: continue; song = Song() #song.year = year #song.hotness = h5get.get_song_hotttnesss(h5, i) #print "Hotness: ", song.hotness; #if math.isnan(song.hotness): # song.hotness = 0.0; song.artist = h5get.get_artist_name(h5, i) song.name = h5get.get_title(h5, i) #track['track'] = str(song.artist) + " " + str(song.name) #track['hotness'] = float(song.hotness) track['artist'] = song.artist track['name'] = song.name song_list.append(track) #song.pop_score = calc_poffpop(song) #print "Poff Score", song.pop_score #all_songs.append(song) #print all_songs json.dump(song_list,w) w.close() """ sorted(all_songs, key=lambda song: song.pop_score)
def data_to_flat_file(basedir, ext='.h5'):
    """Write one CSV row of basic metadata for every song file under basedir.

    Walks ``basedir`` recursively and opens every file whose name ends in
    ``ext`` through ``hdf5_getters``.  For each song the row
    ``[title, album, artist_name, year, duration, segment count, tempo]``
    is appended to ``metadata_wholeA.csv`` in the current working directory.
    The file name and a running song counter are printed for each song.

    Only the values that end up in the row are read from the song file.

    Args:
        basedir: root directory that is searched for song files.
        ext: file name suffix of the song files (default ``'.h5'``).
    """
    count = 0  # song counter

    # Binary mode is what the Python 2 csv module expects.  The context
    # manager guarantees the rows are flushed and the file is closed.
    with open("metadata_wholeA.csv", "wb") as out_file:
        writer = csv.writer(out_file)
        for root, _dirs, _names in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*' + ext)):
                print(f)  # the name of the file
                h5 = hdf5_getters.open_h5_file_read(f)
                try:
                    # Double quotes are stripped from the text fields.  A
                    # value containing a comma is echoed and followed by a
                    # one second pause -- presumably so it can be spotted
                    # on the console; the value itself is written unchanged.
                    title = hdf5_getters.get_title(h5).replace('"', '')
                    if ',' in title:
                        print(title)
                        time.sleep(1)

                    album = hdf5_getters.get_release(h5).replace('"', '')
                    if ',' in album:
                        print(album)
                        time.sleep(1)

                    artist_name = hdf5_getters.get_artist_name(h5)
                    if ',' in artist_name:
                        print(artist_name)
                        time.sleep(1)
                    artist_name = artist_name.replace('"', '')

                    duration = hdf5_getters.get_duration(h5)
                    year = hdf5_getters.get_year(h5)

                    # a missing tempo (NaN) is stored as -1
                    tempo = hdf5_getters.get_tempo(h5)
                    if numpy.isnan(tempo):
                        tempo = -1

                    seg_start_count = get_count(hdf5_getters.get_segments_start(h5))

                    # Writing to the flat file
                    writer.writerow([title, album, artist_name, year,
                                     duration, seg_start_count, tempo])
                finally:
                    # release the song file even when a getter fails
                    h5.close()
                count = count + 1
                print(count)
import hdf5_getters # This script converts the summary H5 files only 300MB to a csv file # Run only on the Master Node since h5_getters cannot open a remote(ie. HDFS) file if __name__ == "__main__": with open("fields.csv", "wb") as f: writer = csv.writer(f) # initialize the csv writer # for each track in the summary file, get the 11 fields and output to csv h5_file = hdf5_getters.open_h5_file_read("msd_summary_file.h5") for k in range(1000000): print "index!!!: ", k id = hdf5_getters.get_track_id(h5_file, k) # get track_id TRA13e39.. title = hdf5_getters.get_title(h5_file, k) # get song title artist_name = hdf5_getters.get_artist_name(h5_file, k) year = int(hdf5_getters.get_year(h5_file, k)) hotness = float(hdf5_getters.get_song_hotttnesss(h5_file, k)) artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5_file, k)) f5 = int(hdf5_getters.get_key(h5_file, k)) # get key f2 = float(hdf5_getters.get_loudness(h5_file, k)) # get loudness f1 = float(hdf5_getters.get_tempo(h5_file, k)) # get tempo f4 = int(hdf5_getters.get_duration(h5_file, k)) # get duration f3 = float(hdf5_getters.get_time_signature(h5_file, k)) # get time signature # Get rid of missing info and change invalid numbers for meta data if not artist_name: artist_name = "unknown"
def func_to_extract_features(filename):
    """Extract the feature row of one song file into ``listfeatures``.

    The song hotness is turned into a binary class: 0 for a hotness of at
    most 0.3 and 1 for a hotness of at least 0.6.  Songs whose hotness is
    NaN or lies strictly between 0.3 and 0.6 are skipped.

    For the remaining songs the row ``[hotness_class, energy]`` is built,
    where ``energy`` is looked up in ``energydict`` by title (falling back
    to the title with parenthesised parts removed, then to 0.0).  The row
    is passed through ``list_to_csv`` and appended to the global
    ``listfeatures`` only when none of the checked fields is missing.
    Loudness, key, duration, tempo, artist familiarity, artist hotness and
    the coefficients of variation of segment loudness and beat starts count
    as missing when NaN; the year counts as missing when 0.  Every missing
    field increments the global ``cntnan``.

    Args:
        filename: path of the song file to open with ``GETTERS``.

    Returns:
        0 when the song is skipped because of its hotness, otherwise None.
    """
    global cntnan
    global listfeatures

    h5 = GETTERS.open_h5_file_read(filename)
    try:
        # FEATURE 0: target class derived from the song hotness
        song_hotness = GETTERS.get_song_hotttnesss(h5)
        if math.isnan(song_hotness):
            cntnan = cntnan + 1
            return 0
        if 0.3 < song_hotness < 0.6:
            return 0
        cf = [0 if song_hotness <= 0.3 else 1]

        nanfound = False

        # These fields are not written to the row; they only decide whether
        # the song is complete enough to keep.
        scalar_getters = (
            GETTERS.get_loudness,
            GETTERS.get_key,
            GETTERS.get_duration,
            GETTERS.get_tempo,
            GETTERS.get_artist_familiarity,
            GETTERS.get_artist_hotttnesss,
        )
        for getter in scalar_getters:
            if math.isnan(getter(h5)):
                nanfound = True
                cntnan = cntnan + 1

        # coefficient of variation of segment loudness and of beat starts
        series_getters = (
            GETTERS.get_segments_loudness_max,
            GETTERS.get_beats_start,
        )
        for getter in series_getters:
            if math.isnan(abs(variation(np.array(getter(h5))))):
                nanfound = True
                cntnan = cntnan + 1

        # a year of 0 is treated as missing
        if GETTERS.get_year(h5) == 0:
            nanfound = True
            cntnan = cntnan + 1

        # energy by exact title, else by the title without "(...)" parts
        title = GETTERS.get_title(h5)
        lookup = title
        if lookup not in energydict:
            lookup = re.sub(r'\([^)]*\)', '', title)
        if lookup in energydict:
            energy = energydict[lookup]['energy']
        else:
            energy = 0.0
        cf.append(energy)

        if not nanfound:
            listfeatures.append(list_to_csv(cf))
    finally:
        # release the song file on every path, including errors
        h5.close()
def get_song_info(h5):
    """Print a one-line "artist - title | (year) | tempo bpm" summary of h5."""
    artist = hdf5_getters.get_artist_name(h5)
    title = hdf5_getters.get_title(h5)
    year = hdf5_getters.get_year(h5)
    tempo = hdf5_getters.get_tempo(h5)
    summary = '%s - %s | (%s) | %s bpm' % (artist, title, year, tempo)
    print(summary)