def feat_from_file(path):
    """
    Extract a list of features from one song file, already converted to string.

    The features are, in order: track id, artist name, title (commas removed
    from the two names), loudness, tempo, time signature, key, mode, duration,
    then the per-column average of the segment timbre matrix followed by its
    per-column variance.

    INPUT
       path - path of the HDF5 song file
    RETURN
       list of str, one entry per feature
    """
    h5 = GETTERS.open_h5_file_read(path)
    try:
        # basic info
        feats = [
            GETTERS.get_track_id(h5),
            GETTERS.get_artist_name(h5).decode().replace(',', ''),
            GETTERS.get_title(h5).decode().replace(',', ''),
            GETTERS.get_loudness(h5),
            GETTERS.get_tempo(h5),
            GETTERS.get_time_signature(h5),
            GETTERS.get_key(h5),
            GETTERS.get_mode(h5),
            GETTERS.get_duration(h5),
        ]
        # timbre
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre, axis=0))
        feats.extend(np.var(timbre, axis=0))
    finally:
        # done with h5 file, also when one of the getters raised
        h5.close()
    # makes sure we return strings; a real list rather than a lazy map object
    # NOTE(review): the track id is stringified without decoding, unlike the
    # artist name and title - confirm that is what consumers expect.
    return [str(x) for x in feats]
def load_non_time_data():
    """
    Walk `basedir` and collect, for every song file whose year is not 0, the
    year together with a vector of ten non-temporal features.

    Uses the names `basedir`, `ext` and `getter` from the enclosing scope.
    A running file counter is printed for every file visited.

    RETURN
       years        - list of years
       ten_features - list of 1-D arrays; ten_features[i] belongs to years[i]
    """
    years = []
    ten_features = []
    num = 0
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                if year != 0:
                    title_length = len(getter.get_title(h5))
                    terms_length = len(getter.get_artist_terms(h5))
                    tags_length = len(getter.get_artist_mbtags(h5))
                    hotness = getter.get_artist_hotttnesss(h5)
                    duration = getter.get_duration(h5)
                    loudness = getter.get_loudness(h5)
                    mode = getter.get_mode(h5)
                    release_length = len(getter.get_release(h5))
                    tempo = getter.get_tempo(h5)
                    name_length = len(getter.get_artist_name(h5))
                    ten_feature = np.hstack([
                        title_length, tags_length, hotness, duration,
                        terms_length, loudness, mode, release_length,
                        tempo, name_length])
                    # Store the pair only once every value has been read, so
                    # a failing getter cannot leave the two lists misaligned.
                    years.append(year)
                    ten_features.append(ten_feature)
            except Exception as err:
                # best effort: report the unreadable file and carry on
                print('skipping', f, ':', err)
            finally:
                h5.close()
    return years, ten_features
Пример #3
0
def get_key_tempo(filename):
    """
    Read tempo, key, artist name and title from one song file, ask the
    Echo Nest client for the artist's terms and, when any come back, append
    the row ``tempo, key, artist, title, first term name`` to 'points.csv'.

    Lookup errors are reported on stdout and the song is skipped.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        tempo = GETTERS.get_tempo(h5)
        key = GETTERS.get_key(h5)
        ar = GETTERS.get_artist_name(h5)
        title = GETTERS.get_title(h5)
    finally:
        # everything needed has been read; do not keep the file open during
        # the remote lookup below
        h5.close()

    terms = None
    try:
        terms = artist.Artist(str(ar)).get_terms()
        time.sleep(.12)  # presumably throttles successive API calls
    except EchoNestIOError:
        print("echonestIOerror")
    except EchoNestAPIError as e:
        if e.code == 3:
            # NOTE(review): looks like a back-off for this error code - confirm
            time.sleep(1)
        elif e.code == 5:
            print("code is 5")
        else:
            print("error..")
    if terms:
        print(terms[0]['name'])
        with open('points.csv', 'a') as fp:
            writer = csv.writer(fp, delimiter=',')
            writer.writerow([tempo, key, ar, title, terms[0]['name']])
def insert_singer():
    """
    Insert one singer tuple per distinct artist id.

    Reads artist id and artist name for the indices 0 .. hard.NUM_SINGERS - 1
    from `h5` (a name this function does not define itself) and executes
    `sql.INSERT_SINGER` for every id not seen before, with hotness 0.

    RETURN
       0 when every insert went through, -1 when an exception was raised.
       The connection is committed and closed in both cases.
    """
    print('Inserting singer tuples')
    conn = get_conn()
    cursor = get_cursor(conn)

    seen_ids = set()
    # kept outside the loop so the error report can show the last values
    singer_name = None
    singer_id = None
    start_hotness = None

    try:
        for idx in range(hard.NUM_SINGERS):
            singer_id = bytes2str(GETTERS.get_artist_id(h5, idx))
            if singer_id in seen_ids:
                continue
            seen_ids.add(singer_id)
            singer_name = bytes2str(GETTERS.get_artist_name(h5, idx))
            start_hotness = 0
            cursor.execute(sql.INSERT_SINGER,
                           id=singer_id, name=singer_name,
                           hotness=start_hotness)
    except Exception as err:
        print(err, 'insert singer tuple error')
        print('name:', singer_name, 'singer_id:', singer_id,
              'hotness:', start_hotness)
        return -1
    else:
        return 0
    finally:
        conn.commit()
        close_all(conn, cursor)
Пример #5
0
def debug_from_song_file(connect, h5path, verbose=0):
    """
    Slow debugging function that takes a h5 file, reads the info,
    checks the match with the musicbrainz db and prints out the result.
    Only prints when we don't get an exact match (and verbose > 0).

    INPUT
       connect - connection passed on to find_year_safemode / get_artist_tags
       h5path  - path of the HDF5 song file
       verbose - messages are printed when > 0
    RETURN
       gotmbid, gotyear, gottags - 0/1 indicators telling whether an artist
       mbid, a year and at least one tag were found for this file
    """
    import hdf5_utils as HDF5
    import hdf5_getters as GETTERS
    h5 = HDF5.open_h5_file_read(h5path)
    try:
        title = GETTERS.get_title(h5)
        release = GETTERS.get_release(h5)
        artist = GETTERS.get_artist_name(h5)
        ambid = GETTERS.get_artist_mbid(h5)
    finally:
        h5.close()
    # mbid; len() works for both str and bytes, whereas comparing bytes
    # with '' is never true
    gotmbid = 1
    if len(ambid) == 0:
        gotmbid = 0
        if verbose > 0:
            print('no mb id for:', artist)
    # year
    year = find_year_safemode(connect, ambid, title, release, artist)
    gotyear = 1 if year > 0 else 0
    # report only when no year was found, like the tags message below
    if gotyear == 0 and verbose > 0:
        print('no years for:', artist, '|', release, '|', title)
    # tags
    tags, _counts = get_artist_tags(connect, ambid)
    gottags = 1 if len(tags) > 0 else 0
    if gottags == 0 and verbose > 0:
        print('no tags for:', artist)
    # return indicator for mbid, year, tag
    return gotmbid, gotyear, gottags
Пример #6
0
def feat_from_file(path):
    """
    Extract a list of features from one song file.

    The features are, in order: track id, title, artist name, year, loudness,
    tempo, time signature, key, mode, duration, then the per-column average
    of the segment timbre matrix followed by its per-column variance.
    Values are returned as the getters deliver them (no string conversion).

    INPUT
       path - path of the HDF5 song file
    RETURN
       list of feature values
    """
    h5 = GETTERS.open_h5_file_read(path)
    try:
        feats = [
            GETTERS.get_track_id(h5),
            GETTERS.get_title(h5),
            GETTERS.get_artist_name(h5),
            GETTERS.get_year(h5),
            GETTERS.get_loudness(h5),
            GETTERS.get_tempo(h5),
            GETTERS.get_time_signature(h5),
            GETTERS.get_key(h5),
            GETTERS.get_mode(h5),
            GETTERS.get_duration(h5),
        ]

        # timbre
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre, axis=0))
        feats.extend(np.var(timbre, axis=0))
    finally:
        # close the file even when one of the getters raised
        h5.close()

    return feats
Пример #7
0
def fetch_song_from_h5(h5_filepath):
    """
    Download the audio clip belonging to one song file.

    Files whose extension is not '.h5' are ignored. The 7digital track id,
    title and artist name are read from the file, an OAuth (HMAC-SHA1) signed
    request for `get_clip_url(track_id)` is sent, and on HTTP 200 the response
    body is written next to the input file with the extension '.mp3'.
    HTTP 404 is reported as a failed fetch; any other status is printed and
    terminates the program via SystemExit.
    """
    basename, extension = os.path.splitext(h5_filepath)
    if extension != '.h5':
        return
    audio_filepath = basename + '.mp3'
    h5 = hdf5_getters.open_h5_file_read(h5_filepath)
    try:
        track_id = hdf5_getters.get_track_7digitalid(h5)
        track_name = hdf5_getters.get_title(h5)
        artist_name = hdf5_getters.get_artist_name(h5)
    finally:
        # close the file even when one of the getters raised
        h5.close()

    consumer = oauth.Consumer(OAUTH_CLIENT_KEY, OAUTH_CLIENT_SECRET)
    token = oauth.Token(OAUTH_ACCESS_TOKEN, OAUTH_ACCESS_SECRET)
    request = oauth.Request.from_consumer_and_token(
        consumer,
        http_url=get_clip_url(track_id),
        is_form_encoded=True,
        parameters={'country': 'ww'})
    signing_method = oauth.SignatureMethod_HMAC_SHA1()
    request.sign_request(signing_method, consumer, token)
    url = request.to_url()
    r = requests.get(url)
    if r.status_code not in (requests.codes.ok, requests.codes.not_found):
        print(r.status_code, r.headers, r.content)
        # same effect as exit(), without depending on the helper that the
        # `site` module installs for interactive use
        raise SystemExit
    if r.status_code == requests.codes.ok:
        print('FETCHED track {0} {1} {2}'.format(
            track_id, artist_name, track_name))
        with open(audio_filepath, 'wb') as f:
            f.write(r.content)
    else:
        print('FAILED TO FETCH track {0} {1} {2}'.format(
            track_id, artist_name, track_name))
Пример #8
0
def debug_from_song_file(connect,h5path,verbose=0):
    """
    Slow debugging function that takes a h5 file, reads the info,
    checks the match with the musicbrainz db and prints out the result.
    Only prints when we don't get an exact match (and verbose > 0).

    INPUT
       connect - connection passed on to find_year_safemode / get_artist_tags
       h5path  - path of the HDF5 song file
       verbose - messages are printed when > 0
    RETURN
       gotmbid, gotyear, gottags - 0/1 indicators telling whether an artist
       mbid, a year and at least one tag were found for this file
    """
    import hdf5_utils as HDF5
    import hdf5_getters as GETTERS
    h5 = HDF5.open_h5_file_read(h5path)
    try:
        title = GETTERS.get_title(h5)
        release = GETTERS.get_release(h5)
        artist = GETTERS.get_artist_name(h5)
        ambid = GETTERS.get_artist_mbid(h5)
    finally:
        h5.close()
    # mbid; len() works for both str and bytes, whereas comparing bytes
    # with '' is never true
    gotmbid = 1
    if len(ambid) == 0:
        gotmbid = 0
        if verbose > 0:
            print('no mb id for:', artist)
    # year
    year = find_year_safemode(connect, ambid, title, release, artist)
    gotyear = 1 if year > 0 else 0
    # report only when no year was found, like the tags message below
    if gotyear == 0 and verbose > 0:
        print('no years for:', artist, '|', release, '|', title)
    # tags
    tags, _counts = get_artist_tags(connect, ambid)
    gottags = 1 if len(tags) > 0 else 0
    if gottags == 0 and verbose > 0:
        print('no tags for:', artist)
    # return indicator for mbid, year, tag
    return gotmbid, gotyear, gottags
def get_all_titles(basedir, ext='.h5'):
    """
    Walk `basedir` and collect title, artist name, artist terms, loudness and
    per-segment maximum loudness from every file matching '*' + ext.

    INPUT
       basedir - directory to walk
       ext     - file extension to match, '.h5' by default
    RETURN
       titles, artist_names, terms, loudness, segments_loudness_max (lists)

    NOTE(review): terms and segments_loudness_max are skipped for a file when
    their getter fails, so those two lists can be shorter than the others and
    are then no longer index-aligned with them.
    """
    titles = []
    artist_names = []
    terms = []
    loudness = []
    segments_loudness_max = []

    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append(hdf5_getters.get_title(h5))
                artist_names.append(hdf5_getters.get_artist_name(h5))
                try:
                    terms.append(hdf5_getters.get_artist_terms(h5))
                except Exception:
                    # best effort; Exception (not a bare except) so that
                    # KeyboardInterrupt / SystemExit still propagate
                    pass
                loudness.append(hdf5_getters.get_loudness(h5))
                try:
                    segments_loudness_max.append(
                        hdf5_getters.get_segments_loudness_max(h5))
                except Exception:
                    pass
            finally:
                # close the file even when a mandatory getter raised
                h5.close()
    return titles, artist_names, terms, loudness, segments_loudness_max
Пример #10
0
def create_labels(songs, sp):
    """
    Goes through the songs in the list to find the danceability from spotify

    param: the list of song absolute file path names, and the Spotify object to
    use to make calls
    returns: a pair of int32 arrays (labels, broken_labels): the labels that
    get_danceability returned (every value other than -1, in song order) and
    the indices into `songs` for which it returned -1
    """
    print("creating labels...")
    labels = []
    broken_labels = []
    # NOTE(review): iteration starts at index 1, so songs[0] is never looked
    # up - confirm this is intended.
    for i in range(1, len(songs)):
        print(i)
        file_object = hdf.open_h5_file_read(songs[i])
        try:
            artist_name = hdf.get_artist_name(file_object).decode("utf-8")
            # drop any parenthesised part of the title before searching
            title = re.sub(
                r"\(.*\)", "", hdf.get_title(file_object).decode("utf-8"))
        finally:
            # the file is no longer needed once both strings are read; close
            # it before the remote lookup and also when a getter raised
            file_object.close()
        query = "artist: " + artist_name + " track: " + title
        label = get_danceability(query, sp)
        if label != -1:
            labels.append(label)
        else:
            broken_labels.append(i)
    print("NUMBER OF LOST SONGS = ", len(broken_labels))
    return (np.array(labels, dtype=np.int32),
            np.array(broken_labels, dtype=np.int32))
Пример #11
0
    def process_song(self, song_path):
        """
        Read one song file and return its metadata and segment features.

        Returns a dict with the keys 'id' (7digital track id as int),
        'source_id' (decoded song id), 'name' and 'artist' (decoded and
        lower-cased), 'year' (int), 'timbre' and 'chroma' (segment timbre and
        segment pitches, each passed through self.ndarray_list_to_ndlist).
        """
        song_data = h5.open_h5_file_read(song_path)
        try:
            song_id = h5.get_song_id(song_data).decode('UTF-8')
            song_int_id = int(h5.get_track_7digitalid(song_data))
            song_name = h5.get_title(song_data).decode('UTF-8').lower()
            artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
            song_year = int(h5.get_year(song_data))

            timbre = self.ndarray_list_to_ndlist(
                h5.get_segments_timbre(song_data))
            chroma = self.ndarray_list_to_ndlist(
                h5.get_segments_pitches(song_data))
        finally:
            # close the file even when reading or decoding failed
            song_data.close()

        return {
            'id': song_int_id,
            'source_id': song_id,
            'name': song_name,
            'artist': artist_name,
            'year': song_year,
            'timbre': timbre,
            'chroma': chroma
        }
Пример #12
0
def get_info(basedir, ext='.h5'):
    """
    Walk `basedir` and write one JSON object per song file to "result.txt".

    Each object holds song_title, artist_name, key, minor-major (the mode),
    hotness, artist_location, longitude and latitude. Text fields are decoded
    as Latin-1. Every object is also printed to stdout.

    INPUT
       basedir - directory to walk
       ext     - file extension to match, '.h5' by default
    """
    # the with-block closes the result file even when a song file fails
    with open("result.txt", "w") as resultFile:
        # Going through all sub-directories under the base directory
        for root, dirs, files in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*' + ext)):
                h5 = hdf5_getters.open_h5_file_read(f)
                try:
                    dictionary = {
                        "song_title":
                            hdf5_getters.get_title(h5).decode('Latin-1'),
                        "artist_name":
                            hdf5_getters.get_artist_name(h5).decode('Latin-1'),
                        "key": float(hdf5_getters.get_key(h5)),
                        "minor-major": float(hdf5_getters.get_mode(h5)),
                        "hotness": hdf5_getters.get_song_hotttnesss(h5),
                        "artist_location":
                            hdf5_getters.get_artist_location(h5).decode('Latin-1'),
                        "longitude":
                            float(hdf5_getters.get_artist_longitude(h5)),
                        "latitude":
                            float(hdf5_getters.get_artist_latitude(h5)),
                    }
                finally:
                    h5.close()
                print(dictionary)
                # one JSON object per line
                resultFile.write(json.dumps(dictionary) + "\n")
Пример #13
0
    def process_song(self, song_path):
        """
        Read one song file and merge its data with the Spotify track info.

        Returns None when SpotifyInterface.search_track_info finds nothing
        for the (lower-cased) artist and song name. Otherwise returns a dict
        with the keys 'id', 'name', 'artist', 'year', 'timbre' and 'chroma'
        plus every key of the track info.
        """
        # read file
        song_data = h5.open_h5_file_read(song_path)
        try:
            # process file
            song_int_id = int(h5.get_track_7digitalid(song_data))
            song_name = h5.get_title(song_data).decode('UTF-8').lower()
            artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
            song_year = int(h5.get_year(song_data))

            sp = SpotifyInterface()
            track_info = sp.search_track_info(artist_name, song_name)
            if track_info is None:
                return None

            # the segment arrays are only read once the track is known
            timbre = self.ndarray_list_to_ndlist(
                h5.get_segments_timbre(song_data))
            chroma = self.ndarray_list_to_ndlist(
                h5.get_segments_pitches(song_data))
        finally:
            # runs on every exit path: no match, success, or an exception
            song_data.close()

        return {'id': song_int_id, 'name': song_name,
                'artist': artist_name, 'year': song_year, 'timbre': timbre,
                'chroma': chroma, **track_info}
Пример #14
0
def traverseAndWrite(root, genreDirs, genreKeys):
    """
    Recursively walk `root`; for every song of every file found, append the
    song as JSON to one file per matching genre.

    A song is skipped when notValidSong(...) says so. For each genre in
    `genreKeys` for which genreInTags(genre, tags) holds, the song is written
    through writeToDescriptor to
    genreDirs[genre] + "/" + <artist> + "--" + <title> + ".json" (append
    mode), with artist and title reduced to letters, digits and "-_.() ".

    INPUT
       root      - file or directory to process
       genreDirs - mapping genre -> output directory
       genreKeys - genres to look for in the artist tags
    """
    if not isfile(root):
        for f in listdir(root):
            traverseAndWrite(root + "/" + f, genreDirs, genreKeys)
        return

    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    h5 = hdf5_getters.open_h5_file_read(root)
    try:
        numOfSongs = hdf5_getters.get_num_songs(h5)
        for index in range(numOfSongs):
            tags = hdf5_getters.get_artist_mbtags(h5, index)
            artist = hdf5_getters.get_artist_name(h5, index)
            songName = hdf5_getters.get_title(h5, index)
            segmentTimbre = hdf5_getters.get_segments_timbre(h5, index)
            segmentPitches = hdf5_getters.get_segments_pitches(h5, index)
            if notValidSong(tags, artist, songName, segmentTimbre,
                            segmentPitches):
                # the file must stay open: later indices still read from it
                continue
            for genre in genreKeys:
                if genreInTags(genre, tags):
                    song = {}
                    song['genre'] = genre
                    song['artist_name'] = artist
                    song['song_title'] = songName
                    song['segments_pitches'] = segmentPitches.tolist()
                    song['segments_timbre'] = segmentTimbre.tolist()

                    # sanitized copies for the file name only, so the JSON of
                    # a second matching genre still gets the original strings
                    safeSong = ''.join(c for c in songName if c in valid_chars)
                    safeArtist = ''.join(c for c in artist if c in valid_chars)
                    out_path = (genreDirs[genre] + "/" + safeArtist + "--"
                                + safeSong + ".json")
                    with open(out_path, 'a') as fd:
                        writeToDescriptor(fd, song)
    finally:
        h5.close()
Пример #15
0
def func_to_desired_song_data(filename):
    """
    Collect the metadata of one song file if its track id is wanted.

    For every entry of `random_songs` whose first element equals the file's
    track id, a dict with title, artist, year, tempo, key, loudness, energy,
    danceability, time_signature, mode and hotttness is appended to
    `all_the_data`. Title and artist go through replace_characters.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        track_id = GETTERS.get_track_id(h5)
        for song in random_songs:
            if song[0] != track_id:
                continue
            print("FOUND ONE!")
            all_the_data.append({
                'title': replace_characters(GETTERS.get_title(h5)),
                'artist': replace_characters(GETTERS.get_artist_name(h5)),
                'year': GETTERS.get_year(h5),
                'tempo': GETTERS.get_tempo(h5),
                'key': GETTERS.get_key(h5),
                'loudness': GETTERS.get_loudness(h5),
                'energy': GETTERS.get_energy(h5),
                'danceability': GETTERS.get_danceability(h5),
                'time_signature': GETTERS.get_time_signature(h5),
                'mode': GETTERS.get_mode(h5),
                'hotttness': GETTERS.get_song_hotttnesss(h5)
            })
    finally:
        # close the file even when a getter or replace_characters raised
        h5.close()
Пример #16
0
def feat_from_file(path):
    """
    Extract a list of features from one song file, already converted to string.

    The features are, in order: track id, artist name, title (commas removed
    from the two names), loudness, tempo, time signature, key, mode, duration,
    then the per-column average of the segment timbre matrix followed by its
    per-column variance.

    INPUT
       path - path of the HDF5 song file
    RETURN
       list of str, one entry per feature
    """
    h5 = GETTERS.open_h5_file_read(path)
    try:
        # basic info
        feats = [
            GETTERS.get_track_id(h5),
            GETTERS.get_artist_name(h5).replace(',', ''),
            GETTERS.get_title(h5).replace(',', ''),
            GETTERS.get_loudness(h5),
            GETTERS.get_tempo(h5),
            GETTERS.get_time_signature(h5),
            GETTERS.get_key(h5),
            GETTERS.get_mode(h5),
            GETTERS.get_duration(h5),
        ]
        # timbre
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre, axis=0))
        feats.extend(np.var(timbre, axis=0))
    finally:
        # done with h5 file, also when one of the getters raised
        h5.close()
    # makes sure we return strings; a list on Python 2 and 3 alike (map()
    # would be a lazy iterator on Python 3)
    return [str(x) for x in feats]
Пример #17
0
def get_all_examples(basedir, genre_dict, ext='.h5'):
    """
    From a base directory, goes through all subdirectories,
    and grabs all songs and their features and puts them into a pandas dataframe
    INPUT
       basedir    - base directory of the dataset
       genre_dict - a dictionary mapping track id to genre; songs whose track
                    id is not a key are skipped
       ext        - extension, .h5 by default
    RETURN
       dataframe with one row per song found in genre_dict. 'num_sections'
       and 'num_segments' hold the lengths of the sections_start and
       segments_confidence arrays respectively.
    """
    columns = [
        'artist_name', 'title', 'song_id', 'genre', 'year',
        'key', 'key_confidence', 'mode', 'mode_confidence',
        'time_signature', 'time_signature_confidence',
        'duration', 'end_of_fade_in', 'loudness',
        'song_hotttnesss', 'tempo', 'num_sections', 'num_segments',
    ]
    # Rows are gathered in a list and turned into a frame once at the end;
    # growing a DataFrame row by row copies it every time, and
    # DataFrame.append no longer exists in pandas 2.
    rows = []

    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = GETTERS.open_h5_file_read(f)
            try:
                song_id = GETTERS.get_track_id(h5).decode('utf-8')
                if song_id not in genre_dict:
                    continue
                rows.append((
                    GETTERS.get_artist_name(h5),
                    GETTERS.get_title(h5),
                    song_id,
                    genre_dict[song_id],
                    GETTERS.get_year(h5),
                    GETTERS.get_key(h5),
                    GETTERS.get_key_confidence(h5),
                    GETTERS.get_mode(h5),
                    GETTERS.get_mode_confidence(h5),
                    GETTERS.get_time_signature(h5),
                    GETTERS.get_time_signature_confidence(h5),
                    GETTERS.get_duration(h5),
                    GETTERS.get_end_of_fade_in(h5),
                    GETTERS.get_loudness(h5),
                    GETTERS.get_song_hotttnesss(h5),
                    GETTERS.get_tempo(h5),
                    # each count goes under its own column; previously the
                    # section count was stored under the name 'num_segments'
                    len(GETTERS.get_sections_start(h5)),
                    len(GETTERS.get_segments_confidence(h5)),
                ))
            finally:
                # runs for skipped songs and on errors as well
                h5.close()

    return pd.DataFrame(rows, columns=columns)
Пример #18
0
def get_attribute(files):
    """
    Read a fixed set of attributes from every song file.

    For each file (its path is printed first) the following are collected in
    this order: num_songs, artist familiarity, artist hotttnesss,
    danceability, energy, key, key confidence, loudness, mode, mode
    confidence, tempo, time signature, time signature confidence, title,
    artist name. The collected list is passed through np.nan_to_num.

    INPUT
       files - iterable of HDF5 song file paths
    RETURN
       list with one np.nan_to_num result per file
    """
    array = []
    for f in files:
        print(f)
        h5 = hdf5_getters.open_h5_file_read(f)
        try:
            temp = [
                hdf5_getters.get_num_songs(h5),
                hdf5_getters.get_artist_familiarity(h5),
                hdf5_getters.get_artist_hotttnesss(h5),
                hdf5_getters.get_danceability(h5),
                hdf5_getters.get_energy(h5),
                hdf5_getters.get_key(h5),
                hdf5_getters.get_key_confidence(h5),
                hdf5_getters.get_loudness(h5),
                hdf5_getters.get_mode(h5),
                hdf5_getters.get_mode_confidence(h5),
                hdf5_getters.get_tempo(h5),
                hdf5_getters.get_time_signature(h5),
                hdf5_getters.get_time_signature_confidence(h5),
                hdf5_getters.get_title(h5),
                hdf5_getters.get_artist_name(h5),
            ]
        finally:
            # close the file even when one of the getters raised
            h5.close()
        # NOTE(review): the last two entries are title and artist name, so
        # the list is not purely numeric - confirm nan_to_num behaves as
        # intended on such mixed input.
        array.append(np.nan_to_num(temp))
    return array
def get_all_titles(basedir, ext='.h5'):
    """
    Walk `basedir` and collect title, artist name, artist terms, loudness and
    per-segment maximum loudness from every file matching '*' + ext.

    INPUT
       basedir - directory to walk
       ext     - file extension to match, '.h5' by default
    RETURN
       titles, artist_names, terms, loudness, segments_loudness_max (lists)

    NOTE(review): terms and segments_loudness_max are skipped for a file when
    their getter fails, so those two lists can be shorter than the others and
    are then no longer index-aligned with them.
    """
    titles = []
    artist_names = []
    terms = []
    loudness = []
    segments_loudness_max = []

    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append(hdf5_getters.get_title(h5))
                artist_names.append(hdf5_getters.get_artist_name(h5))
                try:
                    terms.append(hdf5_getters.get_artist_terms(h5))
                except Exception:
                    # best effort; Exception (not a bare except) so that
                    # KeyboardInterrupt / SystemExit still propagate
                    pass
                loudness.append(hdf5_getters.get_loudness(h5))
                try:
                    segments_loudness_max.append(
                        hdf5_getters.get_segments_loudness_max(h5))
                except Exception:
                    pass
            finally:
                # close the file even when a mandatory getter raised
                h5.close()
    return titles, artist_names, terms, loudness, segments_loudness_max
Пример #20
0
def song_to_artist(if_str):
    """
    Return the artist name of the song identified by `if_str`.

    The key is looked up in the pickled songs_tracks mapping to get the track
    name; the song file is then located below the dense subset directory
    using characters 2, 3 and 4 of the track name as nested directory names.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with a songs_tracks.pkl from a trusted source.
    # Pickles are binary data: open with 'rb' and close the file afterwards.
    with open("../../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as fh:
        songs_tracks = pickle.load(fh)
    track = str(songs_tracks[if_str])
    # build path
    path = ("../../msd_dense_subset/dense/" + track[2] + "/" + track[3]
            + "/" + track[4] + "/" + track + ".h5")
    h5 = GETTERS.open_h5_file_read(path)
    try:
        return GETTERS.get_artist_name(h5)
    finally:
        h5.close()
Пример #21
0
def hdf5_to_features(file_name):
    """
    Turn one HDF5 song file into its identification and its feature vector.

    Parameters
    ----------
    file_name : str
        Path to the HDF5 file.

    Returns
    -------
    list1 : list
        Song ID, song title and artist name.

    list2 : list
        34 features: 10%/90% quantile, variance and mean of the gaps between
        beat starts (4), the same four statistics of the per-segment maximum
        loudness (4), the per-column means of the segment pitches followed by
        the variance of those means, and the per-column means of the segment
        timbre followed by the variance of those means.
    """
    levels = [0.1, 0.9]

    with hdf5_getters.open_h5_file_read(file_name) as h5:
        # Identification
        identification = [
            hdf5_getters.get_song_id(h5),
            hdf5_getters.get_title(h5),
            hdf5_getters.get_artist_name(h5),
        ]

        # Features 1-4: spacing between consecutive beat starts
        beat_gaps = np.diff(hdf5_getters.get_beats_start(h5), axis=0)
        gap_low, gap_high = np.quantile(beat_gaps, levels)
        features = [gap_low, gap_high, np.var(beat_gaps), np.mean(beat_gaps)]

        # Features 5-8: per-segment maximum loudness
        loudness_max = hdf5_getters.get_segments_loudness_max(h5)
        loud_low, loud_high = np.quantile(loudness_max, levels)
        features += [
            loud_low, loud_high, np.var(loudness_max), np.mean(loudness_max)
        ]

        # Features 9-21: pitch column means and the variance of those means
        pitch_avg = hdf5_getters.get_segments_pitches(h5).mean(axis=0)
        features.extend(pitch_avg)
        features.append(pitch_avg.var())

        # Features 22-34: timbre column means and the variance of those means
        timbre_avg = hdf5_getters.get_segments_timbre(h5).mean(axis=0)
        features.extend(timbre_avg)
        features.append(timbre_avg.var())

    return identification, features
def main():
    """
    Write title and artist name of every song from 1990 or later found below
    the hard-coded dataset directory to '../Datasets/MSDSubsetCSV.csv'.

    The header line "Title,ArtistName," is written together with the first
    data row. Each data row holds the lower-cased title and artist name in
    double quotes, with single quotes removed. The path of every visited
    file is printed, as is the year of every skipped song.
    """
    header = "Title,ArtistName"
    csvAttributeList = [name.lower() for name in header.split(',')]
    # starts out holding the header, which therefore goes out with the first
    # data row; reset to "" after every write
    csvRowString = header + ",\n"

    basedir = '/Users/Owner/Desktop/School/2019-2020/COMP400/MillionSongSubset/'
    ext = ".h5"

    # the with-block closes the CSV file even when a song file fails
    with open('../Datasets/MSDSubsetCSV.csv', 'w') as outputFile1:
        for root, dirs, files in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*' + ext)):
                print(f)
                songH5File = hdf5_getters.open_h5_file_read(f)
                try:
                    song = Song(str(hdf5_getters.get_song_id(songH5File)))
                    # NOTE(review): str() of the raw getter value followed by
                    # stripping "b'" relies on the bytes repr; values with
                    # quotes or non-ASCII characters come out mangled.
                    song.title = str(
                        hdf5_getters.get_title(songH5File)).replace(
                            "b'", "").lower()
                    song.artistName = str(
                        hdf5_getters.get_artist_name(songH5File)).replace(
                            "b'", "").lower()
                    song.year = str(hdf5_getters.get_year(songH5File))
                finally:
                    # previously skipped songs left their file open
                    songH5File.close()

                if int(song.year) < 1990:
                    print('nope', int(song.year))
                    continue

                for attribute in csvAttributeList:
                    if attribute == 'ArtistName'.lower():
                        csvRowString += "\"" + song.artistName.replace(
                            "'", "") + "\""
                    elif attribute == 'Title'.lower():
                        csvRowString += "\"" + song.title.replace(
                            "'", "") + "\""
                    else:
                        csvRowString += "Erm. This didn't work. Error. :( :(\n"

                    csvRowString += ","

                # Remove the final comma from each row in the csv
                csvRowString = csvRowString[:-1] + "\n"
                outputFile1.write(csvRowString)
                csvRowString = ""
Пример #23
0
def func_to_get_artist_name(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get the artist name and add it to the set `all_artist_names`
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_artist_names.add(GETTERS.get_artist_name(h5))
    finally:
        # close the file even when the getter raised
        h5.close()
Пример #24
0
def get_track_info(track, h5=None):
    """
    Return "<artist>-<title>" for a track.

    INPUT
       track - track name; used to build the file path when `h5` is not given
               (characters 2, 3 and 4 name the nested directories)
       h5    - already opened song file; when given it is used as is and
               left open for the caller
    RETURN
       str(artist name) + '-' + str(title)
    """
    opened_here = h5 is None
    if opened_here:
        path = ("../../msd_dense_subset/mood/" + track[2] + "/" + track[3]
                + "/" + track[4] + "/" + track + ".h5")
        h5 = GETTERS.open_h5_file_read(path)
    try:
        artist = GETTERS.get_artist_name(h5)
        title = GETTERS.get_title(h5)
    finally:
        # only close what this function opened itself
        if opened_here:
            h5.close()
    return str(artist) + '-' + str(title)
Пример #25
0
def func_to_get_artist_name(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get the artist name and add it to the set `all_artist_names`
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_artist_names.add(GETTERS.get_artist_name(h5))
    finally:
        # close the file even when the getter raised
        h5.close()
Пример #26
0
def better_MSD_sample_dirslist(paths):
    """
    get list of filenames, artist, song title for all h5 files in a list of MSD sample directories

    INPUT
       paths - iterable of directory paths, with or without trailing separator
    RETURN
       list of [filename, artist name, title], one entry per directory entry
    """
    dirdata = []
    for path in paths:
        for fname in os.listdir(path):
            # os.path.join also works when `path` lacks the trailing slash
            # that plain string concatenation needed
            with GETTERS.open_h5_file_read(os.path.join(path, fname)) as h5:
                dirdata.append([fname,
                                GETTERS.get_artist_name(h5),
                                GETTERS.get_title(h5)])
    return dirdata
Пример #27
0
 def func_to_get_instrumental(filename):
     """
     Record a song in `classical` when its artist tags say so.

     When the artist's MusicBrainz tags contain 'classical' or 'orchestral',
     classical[song id] is set to a dict with the keys 'artist' and 'title'.
     """
     h5 = GETTERS.open_h5_file_read(filename)
     try:
         tags = set(GETTERS.get_artist_mbtags(h5))
         genres = {'classical', 'orchestral'}
         if tags.intersection(genres):
             d = {}
             d['artist'] = GETTERS.get_artist_name(h5)
             d['title'] = GETTERS.get_title(h5)
             song_id = GETTERS.get_song_id(h5)
             classical[song_id] = d
     finally:
         # close the file even when one of the getters raised
         h5.close()
Пример #28
0
def song_to_artist(if_str):
    """
    Return the artist name of the song identified by `if_str`.

    The key is looked up in the pickled songs_tracks mapping to get the track
    name; the song file is then located below the dense subset directory
    using characters 2, 3 and 4 of the track name as nested directory names.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with a songs_tracks.pkl from a trusted source.
    # Pickles are binary data: open with 'rb' and close the file afterwards.
    with open("../../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as fh:
        songs_tracks = pickle.load(fh)
    track = str(songs_tracks[if_str])
    # build path
    path = ("../../msd_dense_subset/dense/" + track[2] + "/" + track[3]
            + "/" + track[4] + "/" + track + ".h5")
    h5 = GETTERS.open_h5_file_read(path)
    try:
        return GETTERS.get_artist_name(h5)
    finally:
        h5.close()
Пример #29
0
def get_track_info(track, h5=None):
    """
    Return "<artist>-<title>" for a track.

    INPUT
       track - track name; used to build the file path when `h5` is not given
               (characters 2, 3 and 4 name the nested directories)
       h5    - already opened song file; when given it is used as is and
               left open for the caller
    RETURN
       str(artist name) + '-' + str(title)
    """
    opened_here = h5 is None
    if opened_here:
        path = ("../../../msd_dense_subset/dense/" + track[2] + "/"
                + track[3] + "/" + track[4] + "/" + track + ".h5")
        h5 = GETTERS.open_h5_file_read(path)
    try:
        artist = GETTERS.get_artist_name(h5)
        title = GETTERS.get_title(h5)
    finally:
        # only close what this function opened itself
        if opened_here:
            h5.close()
    return str(artist) + '-' + str(title)
Пример #30
0
def extract_data(filename):
    """
    Append title, artist name and duration of one song to
    `songdata[song id]`, unless the song id is contained in `already`.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        track_id = GETTERS.get_song_id(h5)
        if track_id in already:
            return
        songdata[track_id].append(GETTERS.get_title(h5))
        songdata[track_id].append(GETTERS.get_artist_name(h5))
        # call the getter; previously the function object itself was stored
        songdata[track_id].append(GETTERS.get_duration(h5))
    finally:
        # runs on the early return and on errors as well
        h5.close()
Пример #31
0
def get_all_titles(basedir, ext='.h5'):
    """
    Walk `basedir`; for every file matching '*' + ext print the cleaned title
    and the first value that ws.billboard(title, artist name) returns.

    The title is reduced to ASCII letters, digits and spaces before the
    lookup.

    INPUT
       basedir - directory to walk
       ext     - file extension to match, '.h5' by default
    """
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                title = hdf5_getters.get_title(h5)
                name = hdf5_getters.get_artist_name(h5)
            finally:
                # close before the lookup, and also when a getter raised
                h5.close()
            title = re.sub('[^A-Za-z0-9 ]+', '', title)
            x, _, _ = ws.billboard(title, name)
            # one formatted argument prints the same on Python 2 and 3
            print("%s %s" % (title, x))
Пример #32
0
def MSD_sample_dirlist(path):
    """
    get list of filenames, artist, song title for all h5 files in an MSD sample directory

    INPUT
       path - directory path, with or without trailing separator
    RETURN
       list of [filename, artist name, title], one entry per directory entry
    """
    dirdata = []
    for fname in os.listdir(path):
        # os.path.join also works when `path` lacks the trailing slash that
        # plain string concatenation needed
        h5 = GETTERS.open_h5_file_read(os.path.join(path, fname))
        try:
            dirdata.append([fname,
                            GETTERS.get_artist_name(h5),
                            GETTERS.get_title(h5)])
        finally:
            # close the file even when one of the getters raised
            h5.close()
    return dirdata
Пример #33
0
def getInfo(files):
    """
    Build a table of track id, artist name and title for the given files.

    The track id is the file name up to its first '.'.

    INPUT
       files - sequence of '/'-separated HDF5 song file paths
    RETURN
       numpy array whose first row is the header ['tid', 'artist', 'song'],
       followed by one row per file. With no files, only the 1-D header
       array is returned.
    """
    header = np.array(['tid', 'artist', 'song'])
    rows = [header]

    for fil in files:
        curFile = getter.open_h5_file_read(fil)
        try:
            tid = fil.split('/')[-1].split('.')[0]
            curArtist = getter.get_artist_name(curFile)
            curTitle = getter.get_title(curFile)
            rows.append(np.array([tid, curArtist, curTitle]))
        finally:
            # close the file even when one of the getters raised
            curFile.close()

    if len(rows) == 1:
        return header
    # stack once at the end; stacking inside the loop re-copies the whole
    # table for every file
    return np.vstack(rows)
Пример #34
0
def songinfo(if_str):
    """Return ``"<artist> - <title> (<year>)"`` for the song keyed by ``if_str``.

    ``if_str`` is looked up in the pickled ``songs_tracks`` mapping; the
    resulting value (converted to ``str``) is used as the track name, and
    its characters at index 2, 3 and 4 select the three nested directories
    that hold ``<track>.h5``.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use it on a trusted songs_tracks.pkl.
    # Binary mode is what pickle expects, and the context manager closes the
    # file (it was previously left open).
    with open("../../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as pkl:
        songs_tracks = pickle.load(pkl)
    track = str(songs_tracks[if_str])
    # build path
    path = ("../../msd_dense_subset/dense/" + track[2] + "/" + track[3]
            + "/" + track[4] + "/" + track + ".h5")
    h5 = GETTERS.open_h5_file_read(path)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        song_name = GETTERS.get_title(h5)
        year = GETTERS.get_year(h5, 0)
    finally:
        # Close the handle even if one of the getters raises.
        h5.close()
    return artist_name + " - " + song_name + " (" + str(year) + ")"
Пример #35
0
def _extractSongData(file_path, filename):
    """Read the identifying metadata of one song file.

    ``file_path`` is the HDF5 file to open; ``filename`` is its name, whose
    last three characters are dropped to form the track id.

    Returns the tuple
    ``(track_id, song_id, dig7_id, title, release, artist_name, year)``;
    song id, title, release and artist name are decoded from UTF-8.
    """
    def _utf8(raw):
        return raw.decode('UTF-8')

    handle = hdf5_getters.open_h5_file_read(file_path)
    song_id = _utf8(hdf5_getters.get_song_id(handle))
    dig7_id = hdf5_getters.get_track_7digitalid(handle)
    title = _utf8(hdf5_getters.get_title(handle))
    release = _utf8(hdf5_getters.get_release(handle))
    artist_name = _utf8(hdf5_getters.get_artist_name(handle))
    year = hdf5_getters.get_year(handle)
    handle.close()

    track_id = filename[:-3]
    return (track_id, song_id, dig7_id, title, release, artist_name, year)
Пример #36
0
def getArtistNameAndSongName(filename):
    """
    Store the (artist name, title) pair of one song file:
    - open the song file
    - read artist name, title and song id
    - save the pair in ``songsAndArtists`` under ``str(song_id)``
    - close the file
    """
    handle = GETTERS.open_h5_file_read(filename)
    pair = (GETTERS.get_artist_name(handle), GETTERS.get_title(handle))
    key = str(GETTERS.get_song_id(handle))
    songsAndArtists[key] = pair
    handle.close()
def getArtistNameAndSongName(filename):
    """
    Store the (artist name, title) pair of one song file:
    - open the song file
    - read artist name, title and song id
    - save the pair in ``songsAndArtists`` under ``str(song_id)``
    - close the file
    """
    song_file = GETTERS.open_h5_file_read(filename)
    artist = GETTERS.get_artist_name(song_file)
    title = GETTERS.get_title(song_file)
    song_key = str(GETTERS.get_song_id(song_file))
    songsAndArtists[song_key] = (artist, title)
    song_file.close()
Пример #38
0
def getInfo(files):
    """Collect track id, artist and title of each song file into an array.

    The track id is the file name (text after the last '/') cut at its
    first '.'.  The returned array begins with the header
    ``['tid', 'artist', 'song']``; each file adds one row below it.  When
    ``files`` is empty the header is returned as a 1-D array.
    """
    rows = np.array(['tid', 'artist', 'song'])

    for song_path in files:
        song_file = getter.open_h5_file_read(song_path)
        file_name = song_path.split('/')[-1]
        tid = file_name.split('.')[0]
        artist = getter.get_artist_name(song_file)
        title = getter.get_title(song_file)
        rows = np.vstack([rows, np.array([tid, artist, title])])
        song_file.close()

    return rows
Пример #39
0
def func_to_get_desired_values(filename, returnValue = False):
    """
    Read the requested fields of one song file and store or return them.

    For every getter name in the module-level ``elementsRequested`` the
    getter is looked up on ``GETTERS`` and called on the opened file; the
    (possibly normalised) results form ``record``.  Track id, artist name,
    title, artist mbtags and release are read as well.

    INPUT :
    filename    - The name of the h5 file to be loaded
    returnValue - when False (default) the entry is appended to the global
                  ``all_desired_data`` and None is returned; when True the
                  entry is returned and the global is left untouched.

    The entry has the form
    ``[[[song_id, title, artist_name, elementsRequested], artist_name,
    title, artist_mbtags, release], record]``.
    """
    global all_desired_data
    # Open file
    h5 = GETTERS.open_h5_file_read(filename)

    # Create and fill a record
    record = []
    for element in elementsRequested:
        # ``element`` is the name of a GETTERS function, e.g. a 'get_...' string.
        result = getattr(GETTERS, element)(h5)
        try:
            # Empty strings are replaced by a placeholder text.
            if result == '':
                result = 'Adlen - void'
        except:
            # NOTE(review): presumably guards against the comparison raising
            # for array-valued results - confirm.
            pass
        try:
            if isinstance(result, np.ndarray):
                # Arrays with more than one element collapse to their mean;
                # shorter arrays become the empty string.
                if len(result) > 1:
                    result = float(np.mean(result))
                else:
                    result = ''
        except:
            # Only reached when the block above raised; then fall back to a
            # float conversion and keep the raw value if that fails too.
            try:
                result = float(result)
            except:
                pass
        record.append(result)

    # Note: ``song_id`` actually holds the *track* id (get_track_id).
    song_id = GETTERS.get_track_id(h5)
    artist_name = GETTERS.get_artist_name(h5)
    title = GETTERS.get_title(h5)
    artist_mbtags = GETTERS.get_artist_mbtags(h5)
    release = GETTERS.get_release(h5)

    # ``unicode`` is a Python 2 builtin; this block does not run on Python 3.
    song_id = unicode(song_id.decode('utf-8'))
    title = unicode(title.decode('utf-8'))
    artist_name = unicode(artist_name.decode('utf-8'))
    if not returnValue:
        all_desired_data.append([[[song_id, title, artist_name, elementsRequested], artist_name, title, artist_mbtags, release], record])
    
    h5.close()
    
    if returnValue:
        return [[[song_id, title, artist_name, elementsRequested], artist_name, title, artist_mbtags, release], record]
Пример #40
0
def songinfo(if_str):
    """Return ``"<artist> - <title> (<year>)"`` for the song keyed by ``if_str``.

    ``if_str`` is looked up in the pickled ``songs_tracks`` mapping; the
    resulting value (converted to ``str``) is used as the track name, and
    its characters at index 2, 3 and 4 select the three nested directories
    that hold ``<track>.h5``.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use it on a trusted songs_tracks.pkl.
    # Binary mode is what pickle expects, and the context manager closes the
    # file (it was previously left open).
    with open("../../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as pkl:
        songs_tracks = pickle.load(pkl)
    track = str(songs_tracks[if_str])
    # build path
    path = "../../msd_dense_subset/dense/" + track[2] + "/" + track[
        3] + "/" + track[4] + "/" + track + ".h5"
    h5 = GETTERS.open_h5_file_read(path)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        song_name = GETTERS.get_title(h5)
        year = GETTERS.get_year(h5, 0)
    finally:
        # Close the handle even if one of the getters raises.
        h5.close()
    return artist_name + " - " + song_name + " (" + str(year) + ")"
Пример #41
0
def MSD_sample_dirlist_save(path,file_path):
    """
    Get [directory, filename, artist, song title] for every entry of an MSD
    sample directory and append the rows to a csv file.

    Every name returned by ``os.listdir(path)`` is opened as an HDF5 song
    file (no filtering by extension is done).  ``path`` may be given with or
    without a trailing separator.  Rows are appended to ``file_path`` using
    ',' as delimiter and '|' as quote character.

    Returns the list of rows that was written.
    """
    import csv
    dirdata = []
    for fname in os.listdir(path):
        # os.path.join works whether or not ``path`` ends with a separator.
        h5 = GETTERS.open_h5_file_read(os.path.join(path, fname))
        try:
            dirdata.append([path, fname,
                            GETTERS.get_artist_name(h5),
                            GETTERS.get_title(h5)])
        finally:
            h5.close()
    # The context manager flushes and closes the csv file; it used to be
    # left open, so rows could stay unwritten until interpreter exit.
    with open(file_path, 'a') as out:
        listwriter = csv.writer(out, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        listwriter.writerows(dirdata)
    return dirdata
Пример #42
0
def dump_pitches(rootdir = './data/', filename = 'pitches.p'):
    """Pickle the segment pitches of mid-length songs found under ``rootdir``.

    Walks ``rootdir`` recursively.  For every file whose name ends with
    '.h5' (case-insensitive) the segment-pitches array is read; songs whose
    array has between 500 and 1500 rows (inclusive) are kept.  The kept
    arrays are pickled to ``filename`` and a parallel list of
    "artist - title - year - tempo" strings is pickled to 'tags.p' in the
    current working directory.  The number of kept songs is printed.

    Returns None.
    """
    pitches = []
    tags = []

    for subdir, _dirs, files in os.walk(rootdir):
        for f in files:
            if not f.lower().endswith('.h5'):
                continue
            h5f = hdf5_getters.open_h5_file_read(os.path.join(subdir, f))
            try:
                seg_ptcs = hdf5_getters.get_segments_pitches(h5f)
                if 500 <= seg_ptcs.shape[0] <= 1500:
                    pitches.append(seg_ptcs)
                    tags.append('%s - %s - %s - %s' % (
                        hdf5_getters.get_artist_name(h5f),
                        hdf5_getters.get_title(h5f),
                        hdf5_getters.get_year(h5f),
                        hdf5_getters.get_tempo(h5f)))
            finally:
                # Close the handle even when a getter raises.
                h5f.close()

    # Context managers make sure both pickle files are flushed and closed.
    with open(filename, 'wb') as out:
        pickle.dump(pitches, out)
    with open('tags.p', 'wb') as out:
        pickle.dump(tags, out)
    # Single-argument form prints the same text on Python 2 and 3.
    print('Saved {} pitches.'.format(len(pitches)))
Пример #43
0
def map_artists_for_users():
    """Build and pickle the set of artist names each user has songs from.

    For every user in the module-level ``users_songs`` mapping, each of the
    user's songs is translated to a track via the pickled ``songs_tracks``
    mapping, the track's HDF5 file is opened and its artist name is added to
    that user's set.  The resulting ``{user: set(artist names)}`` dictionary
    is pickled to ``USERS_ARTIST_FILE``.

    Returns None.
    """
    users_artists = dict()
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use it on a trusted songs_tracks.pkl.
    with open("../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as pkl:
        songs_tracks = pickle.load(pkl)
    for user in users_songs:
        print(user)
        users_artists[user] = set()
        for song in users_songs[user]:
            track = str(songs_tracks[song])
            # build path: characters 2, 3 and 4 of the track name select the
            # three nested directories
            path = ("../msd_dense_subset/dense/" + track[2] + "/" + track[3]
                    + "/" + track[4] + "/" + track + ".h5")
            h5 = GETTERS.open_h5_file_read(path)
            try:
                users_artists[user].add(GETTERS.get_artist_name(h5))
            finally:
                h5.close()
    # store in pickle file for the moment; HIGHEST_PROTOCOL is a binary
    # format, so the file must be opened in binary mode
    with open(USERS_ARTIST_FILE, 'wb') as f:
        pickle.dump(users_artists, f, pickle.HIGHEST_PROTOCOL)
        print("data saved to %s" % USERS_ARTIST_FILE)
def get_url(h5_file):
    """Look up a preview for the song in the open HDF5 handle ``h5_file``.

    The lower-cased track id is tried first via
    ``get_preview_from_trackid``.  If there is no id, or the lookup returns
    an empty string, a text search on title and artist name is made and the
    preview of the closest match is returned (which may itself be an empty
    string).  Returns None when the text search finds nothing.
    """
    artist_name = GETTERS.get_artist_name(h5_file)
    track_name = GETTERS.get_title(h5_file)
    echo_nest_id = GETTERS.get_track_id(h5_file).lower()

    # The id is a string, so test for "non-empty".  The former ``>= 0``
    # comparison was always true on Python 2 and a TypeError on Python 3.
    if echo_nest_id:
        preview = get_preview_from_trackid(echo_nest_id)
        if preview != '':
            return preview

    res = get_trackid_from_text_search(track_name, artistname=artist_name)
    if len(res) > 0:
        closest_track = get_closest_track(res, track_name)
        return get_preview_from_trackid(closest_track['id'])

    return None
Пример #45
0
def process_song(h5_song_file):
	"""Collect descriptive features of one song into a dictionary.

	NOTE(review): every getter below is called on ``h5``, not on the
	parameter ``h5_song_file``, which is never used.  ``h5`` must therefore
	resolve to a name defined outside this function - confirm whether the
	parameter was meant to be used instead.

	Returns a dict with artist/title metadata, the artist terms as
	[term, frequency, weight] triples, counts and variances of beat and
	section start times, 12 per-column variances and means of the segment
	pitches and timbre, tempo, ``_id`` (the song id) and year.
	"""
	song = {}
	song['artist_familiarity'] = hdf5_getters.get_artist_familiarity(h5)
	song['artist_id'] = hdf5_getters.get_artist_id(h5)
	song['artist_name'] = hdf5_getters.get_artist_name(h5)
	song['artist_hotttnesss'] = hdf5_getters.get_artist_hotttnesss(h5);
	song['title'] = hdf5_getters.get_title(h5)
	terms = hdf5_getters.get_artist_terms(h5)
	terms_freq = hdf5_getters.get_artist_terms_freq(h5)
	terms_weight = hdf5_getters.get_artist_terms_weight(h5)
	terms_array = []
	# Build a list of [term, its frequency, its weight], one entry for every
	# term associated with the artist.
	for i in range(len(terms)):
		terms_array.append([terms[i], terms_freq[i], terms_weight[i]])	
		
	song['artist_terms'] = terms_array
	beats_start = hdf5_getters.get_beats_start(h5)
	# NOTE(review): the original comment claimed this variance is in
	# yoctoseconds (10^-24 s); nothing here establishes the unit.
	song['beats_start_variance'] = variance(beats_start)
	song['number_of_beats'] = len(beats_start)
	song['duration'] = hdf5_getters.get_duration(h5)
	song['loudness'] = hdf5_getters.get_loudness(h5)
	sections_start = hdf5_getters.get_sections_start(h5)
	song['sections_start_variance'] = variance(sections_start)
	song['number_of_sections'] = len(sections_start)
	
	# split_segments must return exactly 12 sequences (presumably one per
	# pitch/timbre dimension - confirm against its definition).
	segments_pitches = hdf5_getters.get_segments_pitches(h5)
	(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_pitches)
	song['segments_pitches_variance'] = [variance(a0), variance(a1), variance(a2),
					variance(a3), variance(a4), variance(a5), variance(a6), variance(a7),
					variance(a8), variance(a9), variance(a10), variance(a11)]
	song['segments_pitches_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), 
					mean(a5), mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)]
	
	# Same treatment for the timbre values; a0..a11 are reused.
	segments_timbre = hdf5_getters.get_segments_timbre(h5)
	(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_timbre)
	song['segments_timbre_variance'] = [variance(a0), variance(a1), variance(a2),
					variance(a3), variance(a4), variance(a5), variance(a6), variance(a7),
					variance(a8), variance(a9), variance(a10), variance(a11)]
	song['segments_timbre_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), 
					mean(a5), mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)]
	song['tempo'] = hdf5_getters.get_tempo(h5)
	# '_id' holds the song id.
	song['_id'] = hdf5_getters.get_song_id(h5)
	song['year'] = hdf5_getters.get_year(h5)	
	return song
def load_raw_data():
    """Load year, timbre, pitch and ten scalar features of every usable song.

    Walks the module-level ``basedir`` and opens every file matching
    ``'*' + ext``.  A song is kept only when its year is non-zero and its
    timbre array has at least 100 rows.  A running file counter is printed
    for every file; ``1`` is printed when reading a file fails, and that
    file is skipped.

    Returns ``(years, timbres, pitches, min_length, ten_features)`` where
    the four lists are index-aligned and ``min_length`` is the smallest
    timbre row count among the kept songs (10000 if none was smaller).
    """
    years = []
    ten_features = []
    timbres = []
    pitches = []
    min_length = 10000
    num = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                if year != 0:
                    timbre = getter.get_segments_timbre(h5)
                    s = np.size(timbre, 0)
                    if s >= 100:
                        pitch = getter.get_segments_pitches(h5)
                        title_length = len(getter.get_title(h5))
                        terms_length = len(getter.get_artist_terms(h5))
                        tags_length = len(getter.get_artist_mbtags(h5))
                        hotness = getter.get_artist_hotttnesss(h5)
                        duration = getter.get_duration(h5)
                        loudness = getter.get_loudness(h5)
                        mode = getter.get_mode(h5)
                        release_length = len(getter.get_release(h5))
                        tempo = getter.get_tempo(h5)
                        name_length = len(getter.get_artist_name(h5))
                        ten_feature = np.hstack([
                            title_length, hotness, duration, tags_length,
                            terms_length, loudness, mode, release_length,
                            tempo, name_length])

                        # Commit only after every value was read, so a
                        # failure midway cannot leave the lists misaligned.
                        if s < min_length:
                            min_length = s
                        years.append(year)
                        timbres.append(timbre)
                        pitches.append(pitch)
                        ten_features.append(ten_feature)
            except Exception:
                # Best effort: skip unreadable songs, but let
                # KeyboardInterrupt / SystemExit propagate.
                print(1)
            finally:
                h5.close()
    return years, timbres, pitches, min_length, ten_features
Пример #47
0
def getInfo(files, genres, songs, topicNum):
    """Build info rows for every (file, song) pair that names the same track.

    A file and a song match when the base name (text after the last '/',
    cut at the first '.') of the file equals that of ``song[1]``.  Each
    match produces one row made of track id, artist name, title,
    ``genres[tid]`` and ``song[2:]``.  All pairs are compared, which is
    simple rather than efficient.

    The accumulator starts with a row of ``topicNum + 4`` zeros that is
    dropped from the returned array.
    """
    def _stem(path):
        return path.split('/')[-1].split('.')[0]

    rows = np.zeros(topicNum + 4)
    for path in files:
        for song in songs:
            if _stem(path) != _stem(song[1]):
                continue
            handle = getter.open_h5_file_read(path)
            tid = _stem(path)
            artist = getter.get_artist_name(handle)
            title = getter.get_title(handle)
            ident = np.array([tid, artist, title])
            full_row = np.hstack([ident, genres[tid], song[2:]])
            rows = np.vstack([rows, full_row])
            handle.close()

    return rows[1:]
Пример #48
0
def map_artists_for_users():
    """Build and pickle the set of artist names each user has songs from.

    For every user in the module-level ``users_songs`` mapping, each of the
    user's songs is translated to a track via the pickled ``songs_tracks``
    mapping, the track's HDF5 file is opened and its artist name is added to
    that user's set.  The resulting ``{user: set(artist names)}`` dictionary
    is pickled to ``USERS_ARTIST_FILE``.

    Returns None.
    """
    users_artists = dict()
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use it on a trusted songs_tracks.pkl.
    with open("../msd_dense_subset/dense/songs_tracks.pkl", 'rb') as pkl:
        songs_tracks = pickle.load(pkl)
    for user in users_songs:
        print(user)
        users_artists[user] = set()
        for song in users_songs[user]:
            track = str(songs_tracks[song])
            # build path: characters 2, 3 and 4 of the track name select the
            # three nested directories
            path = "../msd_dense_subset/dense/" + track[2] + "/" + track[
                3] + "/" + track[4] + "/" + track + ".h5"
            h5 = GETTERS.open_h5_file_read(path)
            try:
                users_artists[user].add(GETTERS.get_artist_name(h5))
            finally:
                h5.close()
    # store in pickle file for the moment; HIGHEST_PROTOCOL is a binary
    # format, so the file must be opened in binary mode
    with open(USERS_ARTIST_FILE, 'wb') as f:
        pickle.dump(users_artists, f, pickle.HIGHEST_PROTOCOL)
        print("data saved to %s" % USERS_ARTIST_FILE)
Пример #49
0
def h5_to_csv_fields(h5,song):
    '''Render one song of an h5 file as a single csv line.

    Inputs: h5, an open h5 file object usable with the getter wrappers
            song, the integer index passed to every getter to select the song
    Output: a string holding all collected fields joined by ',' and
            terminated by a newline
    '''
    def _flat(values):
        # 1-D array -> one csv field (separator is array_to_csv_field's default)
        return array_to_csv_field(list(values))

    def _nested(values):
        # array of vectors -> one csv field; e.g. [[1,2,3],[4,5,6]] is
        # written as '1;2;3_4;5;6' per the original author's note
        return double_Array_to_csv_field(list(values), '_', ';')

    fields = [
        # plain scalar / string getters
        gt.get_artist_name(h5, song),
        gt.get_title(h5, song),
        gt.get_release(h5, song),
        gt.get_year(h5, song),
        gt.get_duration(h5, song),
        gt.get_artist_familiarity(h5, song),
        gt.get_artist_hotttnesss(h5, song),
        gt.get_song_hotttnesss(h5, song),
        # array-valued artist term getters, flattened to one field each
        _flat(gt.get_artist_terms(h5, song)),
        _flat(gt.get_artist_terms_freq(h5, song)),
        _flat(gt.get_artist_terms_weight(h5, song)),
        gt.get_mode(h5, song),
        gt.get_key(h5, song),
        gt.get_tempo(h5, song),
        gt.get_loudness(h5, song),
        gt.get_danceability(h5, song),
        gt.get_energy(h5, song),
        gt.get_time_signature(h5, song),
        _flat(gt.get_segments_start(h5, song)),
        # arrays whose items are themselves vectors
        _nested(gt.get_segments_timbre(h5, song)),
        _nested(gt.get_segments_pitches(h5, song)),
        _flat(gt.get_segments_loudness_start(h5, song)),
        _flat(gt.get_segments_loudness_max(h5, song)),
        _flat(gt.get_segments_loudness_max_time(h5, song)),
        _flat(gt.get_sections_start(h5, song)),
    ]
    # join everything with commas to form the csv line
    return array_to_csv_field(fields, ",") + "\n"
Пример #50
0
def getTrackInfo(starting_num):
    """Collect metadata for one slice of tracks of the file at ``filepath``.

    The slice covers ``tracks_per_thread`` consecutive indices starting at
    ``starting_num * tracks_per_thread``.  Tracks whose id is not in
    ``lyric_track_ids_set`` are skipped.

    Returns a list of
    ``[track_id, artist_name, duration, loudness, tempo, title, year]``
    lists, one per kept track.
    """
    collected = []
    h5 = hdf5_getters.open_h5_file_read(filepath)
    bar = tqdm(range(tracks_per_thread))
    for step in bar:
        idx = int(step) + (starting_num * tracks_per_thread)
        track_id = hdf5_getters.get_track_id(h5, idx).decode()
        if track_id in lyric_track_ids_set:
            artist_name = hdf5_getters.get_artist_name(h5, idx).decode()
            duration = hdf5_getters.get_duration(h5, idx)
            loudness = hdf5_getters.get_loudness(h5, idx)
            tempo = hdf5_getters.get_tempo(h5, idx)
            title = hdf5_getters.get_title(h5, idx).decode()
            year = hdf5_getters.get_year(h5, idx)
            collected.append([track_id, artist_name, duration, loudness,
                              tempo, title, year])
            # the description is only refreshed for tracks that were kept
            bar.set_description("Iteration %d" % idx)
    h5.close()
    return collected
def get_all_data(target, basedir, ext='.h5') :
    """Write one tab-separated line of metadata per song file to ``target``.

    ``target`` is a writable text stream.  A header line with the 14 column
    names is written first; then ``basedir`` is walked recursively and
    every file matching ``'*' + ext`` contributes one line.  A running
    "<count>/10000" progress line is printed after each file.

    Returns None.
    """
    columns = (
        "track_id", "song_id", "title", "artist_name", "artist_location",
        "artist_hotttnesss", "release", "year", "song_hotttnesss",
        "danceability", "duration", "loudness", "sample_rate", "tempo",
    )
    row_format = "\t".join(["%s"] * len(columns)) + "\n"

    # header
    target.write(row_format % columns)

    count = 0
    for root, _dirs, _files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            # Open the song file itself.  The previous code looped over the
            # characters of the path string and handed a freshly created
            # text file object to the HDF5 reader, which cannot work.
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                target.write(row_format % (
                    hdf5_getters.get_track_id(h5),
                    hdf5_getters.get_song_id(h5),
                    hdf5_getters.get_title(h5),
                    hdf5_getters.get_artist_name(h5),
                    hdf5_getters.get_artist_location(h5),
                    hdf5_getters.get_artist_hotttnesss(h5),
                    hdf5_getters.get_release(h5),
                    hdf5_getters.get_year(h5),
                    hdf5_getters.get_song_hotttnesss(h5),
                    hdf5_getters.get_danceability(h5),
                    hdf5_getters.get_duration(h5),
                    hdf5_getters.get_loudness(h5),
                    hdf5_getters.get_analysis_sample_rate(h5),
                    hdf5_getters.get_tempo(h5),
                ))
            finally:
                h5.close()

            # show progress
            count += 1
            print("%d/10000" % count)
Пример #52
0
def get_all_attributes(filename):
    """
    Append the attributes of one song file to 'attributes.csv':
    - open the csv file (append mode) and the song file
    - read all required attributes
    - write them as one tab-separated row
    - close both files

    If reading raises AttributeError the song is skipped silently and no
    row is written; the song file is closed in every case.
    """
    with open('attributes.csv', 'a') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        h5 = None
        try:
            h5 = GETTERS.open_h5_file_read(filename)
            row = [
                GETTERS.get_year(h5),
                GETTERS.get_artist_id(h5),
                GETTERS.get_artist_name(h5),
                GETTERS.get_artist_mbid(h5),
                convert_terms(GETTERS.get_artist_terms(h5)),
                GETTERS.get_artist_hotttnesss(h5),
                GETTERS.get_artist_latitude(h5),
                GETTERS.get_artist_longitude(h5),
                GETTERS.get_artist_familiarity(h5),
                GETTERS.get_danceability(h5),
                GETTERS.get_duration(h5),
                GETTERS.get_energy(h5),
                GETTERS.get_loudness(h5),
                GETTERS.get_song_hotttnesss(h5),
                GETTERS.get_song_id(h5),
                GETTERS.get_tempo(h5),
                GETTERS.get_time_signature(h5),
                GETTERS.get_title(h5),
                GETTERS.get_track_id(h5),
                GETTERS.get_release(h5),
            ]
            csvwriter.writerow(row)
        except AttributeError:
            # Deliberate best effort kept from the original: songs that
            # trigger AttributeError are skipped.
            pass
        finally:
            # Previously the handle leaked whenever the error was swallowed.
            if h5 is not None:
                h5.close()
Пример #53
0
def getURLFromH5(h5path):
    """Print and return the preview found for the song file ``h5path``.

    Exits the process (status 0) with a message when ``h5path`` is not a
    file.  Otherwise the 7digital track id is read from the file; when it
    is non-negative, ``get_preview_from_trackid`` is called and a
    non-empty result is printed and returned.

    Returns None when the track id is negative or the lookup returned an
    empty string.
    """
    if not os.path.isfile(h5path):
        print('invalid path (not a file): %s' % (h5path,))
        sys.exit(0)
    h5 = hdf5_utils.open_h5_file_read(h5path)
    try:
        track_7digitalid = GETTERS.get_track_7digitalid(h5)
    finally:
        h5.close()

    # we already have the 7digital track id? way too easy!
    print("Suggested Song URLs For you")
    print("===========================")
    if track_7digitalid >= 0:
        preview = get_preview_from_trackid(track_7digitalid)
        if preview == '':
            print('something went wrong when looking by track id')
        else:
            print(preview)
            return preview
    return None
def func_to_extract_features(filename):
    """
    Extract the feature list of one song file and record it globally.

    - open the song file
    - read hotness, loudness, key, duration, mean pitches, mean timbre,
      year, tempo, loudness range, artist familiarity, title, artist name
      and a genre tag into the list ``cf``
    - if no value was flagged as missing, append ``list_to_csv(cf)`` to the
      global ``listfeatures`` and the song hotness to ``mydict[artist_name]``
    - close the file

    A NaN hotness/loudness/key/duration or a year of 0 flags the song as
    missing (``nanfound``) and increments the global ``cntnan``.
    Returns None.
    """
    global cntnan	
    global cntdanceability
    global listfeatures

    global listhotness
    global listyear
    global listloudness
    global listkey
    global listmode
    global listduration 

    # cf collects the features of the current song
    cf = []
    h5 = GETTERS.open_h5_file_read(filename)
    # set to 1 as soon as one required value is missing
    nanfound = 0

    #Get target feature: song hotness

    #FEATURE 0
    song_hotness = GETTERS.get_song_hotttnesss(h5)
    if math.isnan(song_hotness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_hotness)

    #FEATURE 1
    #Get song loudness
    song_loudness = GETTERS.get_loudness(h5)
    
    if math.isnan(song_loudness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_loudness)

    #FEATURE 2
    #Get key of the song
    song_key = GETTERS.get_key(h5)
    if math.isnan(song_key):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_key)

    #FEATURE 3
    #Get duration of the song
    song_duration = GETTERS.get_duration(h5)
    if math.isnan(song_duration):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_duration)

    #FEATURE 4-15
    #Get Average Pitch Class across all segments
    #Get the pitches (12 pitches histogram for each segment)
    pitches = GETTERS.get_segments_pitches(h5)
    M = np.mat(pitches)
    meanpitches = M.mean(axis=0)
    pitches_arr = np.asarray(meanpitches)
    pitches_list = []
    for i in range(0,12):
	pitches_list.append(pitches_arr[0][i])

    # NOTE: the 12 means are appended as ONE nested list element
    cf.append(pitches_list)

    #FEATURE 16, 27
    #Get Average Timbre Class across all segments
    timbres = GETTERS.get_segments_timbre(h5)
    M = np.mat(timbres)
    meantimbres = M.mean(axis=0)
    timbre_arr = np.asarray(meantimbres)
    timbre_list = []
    for i in range(0,12):
	timbre_list.append(timbre_arr[0][i])

    # NOTE: again appended as one nested list element
    cf.append(timbre_list)

    #FEATURE 28
    #Get song year (0 is treated as "missing")
    song_year = GETTERS.get_year(h5)
    if song_year == 0:
       nanfound = 1
       cntnan = cntnan + 1
    else:
      cf.append(song_year)

    #FEATURE 29
    #Get song tempo
    song_tempo = GETTERS.get_tempo(h5)
    cf.append(song_tempo)

    #Feature 30
    #Loudness range: largest segment max loudness minus smallest segment
    #start loudness; only added when nothing was missing so far
    max_loudness_arr = GETTERS.get_segments_loudness_max(h5)
    start_loudness_arr = GETTERS.get_segments_loudness_start(h5)
    if nanfound == 0:
       cf.append(max(max_loudness_arr)-min(start_loudness_arr))

    #Feature 31
    artist_familiarity = GETTERS.get_artist_familiarity(h5)
    cf.append(artist_familiarity)

    #Feature 32
    song_title = GETTERS.get_title(h5)
    cf.append(song_title)

    #Feature 33
    artist_name = GETTERS.get_artist_name(h5)
    cf.append(artist_name)

    #Feature 34
    #location = GETTERS.get_artist_location(h5)
    #cf.append(location)

    #Tags: the genre is the artist mbtag with the highest count
    artist_mbtags = GETTERS.get_artist_mbtags(h5)
    if not artist_mbtags.size:
       genre = "Unknown"
    else:
       artist_mbcount = np.array(GETTERS.get_artist_mbtags_count(h5))
       index_max = artist_mbcount.argmax(axis=0)
       genre = artist_mbtags[index_max]
       if genre == 'espa\xc3\xb1ol':
	  genre = "Unknown"

       # NOTE(review): this append sits inside the else branch, so the
       # "Unknown" genre of songs without any tags is never added to cf -
       # confirm whether that is intended.
       cf.append(genre)

    # only songs without missing values are recorded
    if nanfound == 0:
       strlist = list_to_csv(cf)
       listfeatures.append(strlist)
       mydict.setdefault(artist_name,[]).append(song_hotness)
    h5.close()
Пример #55
0
# Import script: walks "data/" and stores one row per artist of every .h5
# song file in the `artist` table of the FinalProject database.
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration.
db = MySQLdb.connect(host="localhost",user="******",passwd="password",db="FinalProject")
db.query("DELETE FROM artist WHERE artist_id = 'a';")
cursor = db.cursor(MySQLdb.cursors.DictCursor)

counter = 0
for subdir, dirs, files in os.walk("data/"):
    for file in files:
        f = os.path.join(subdir, file)
        if ".h5" in f:
            h5 = h.open_h5_file_read(f)
            print("----------")

            # Store artist tuples
            artist_id = h.get_artist_id(h5, 0)
            artist_name = h.get_artist_name(h5, 0)
            # Kept so stored names stay identical to earlier imports; it is
            # no longer needed for quoting now that queries are parameterized.
            artist_name = artist_name.replace("'", "")
            artist_hottness = str(h.get_artist_hotttnesss(h5, 0))
            print(artist_hottness)
            # missing values come back as NaN and are stored as 0.0
            if artist_hottness == "nan":
                artist_hottness = "0.0"
            artist_familiarity = str(h.get_artist_familiarity(h5, 0))
            if artist_familiarity == "nan":
                artist_familiarity = "0.0"
            # Values read from the data files are passed as query parameters
            # instead of being spliced into the SQL text, so quotes or SQL
            # fragments inside them cannot alter the statement.
            cursor.execute("SELECT * FROM artist WHERE artist_id = %s",
                           (artist_id,))
            rs = cursor.fetchall()
            if cursor.rowcount != 1:
                cursor.execute(
                    "INSERT INTO artist VALUES (%s, %s, %s, %s)",
                    (artist_id, artist_name,
                     float(artist_hottness), float(artist_familiarity)))

            # Store artist_genres tuples
            terms = h.get_artist_terms(h5, 0)
Пример #56
0
def _interval_variance(starts):
    """Return the variance of the gaps between consecutive start times."""
    return numpy.var(numpy.ediff1d(starts))


def classify(h5):
    """Collect summary features of one song into a dictionary.

    ``h5`` is an open song file usable with ``hdf5_getters``.

    Returns a dict with:
    - scalar values: duration, danceability, end_of_fade_in, energy, key,
      loudness, mode, hottness, fade_out (duration minus start of fade
      out), tempo, time_signature, year
    - for bars, beats, sections, segments and tatums: the number of start
      times (``num_*``) and the variance of the gaps between consecutive
      start times (``variance_*_length``)
    - the largest per-segment maximum loudness and its matching
      ``segment_loudness_time`` entry, plus mean and variance of the
      per-segment start loudness
    - column-wise mean and variance lists of segment pitches and timbre
    - artist_terms, artist_terms_freq, artist_name, artist_id and title
    """
    output_array = {}
    # duration
    duration = hdf5_getters.get_duration(h5)
    output_array["duration"] = duration
    # number of bars and variance in bar length
    bars = hdf5_getters.get_bars_start(h5)
    output_array["num_bars"] = len(bars)
    output_array["variance_bar_length"] = _interval_variance(bars)
    # number of beats and variance in beat length
    beats = hdf5_getters.get_beats_start(h5)
    output_array["num_beats"] = len(beats)
    # Computed from the beat gaps; it used to reuse the bar gaps by mistake.
    output_array["variance_beats_length"] = _interval_variance(beats)
    # danceability
    output_array["danceability"] = hdf5_getters.get_danceability(h5)
    # end of fade in
    output_array["end_of_fade_in"] = hdf5_getters.get_end_of_fade_in(h5)
    # energy
    output_array["energy"] = hdf5_getters.get_energy(h5)
    # key
    output_array["key"] = int(hdf5_getters.get_key(h5))
    # loudness
    output_array["loudness"] = hdf5_getters.get_loudness(h5)
    # mode
    output_array["mode"] = int(hdf5_getters.get_mode(h5))
    # number of sections and variance in section length
    sections = hdf5_getters.get_sections_start(h5)
    output_array["num_sections"] = len(sections)
    # Variance of the section gaps; it used to be taken over the raw start
    # times, which does not measure length at all.
    output_array["variance_sections_length"] = _interval_variance(sections)
    # number of segments and variance in segment length
    segments = hdf5_getters.get_segments_start(h5)
    output_array["num_segments"] = len(segments)
    # Same correction as for sections.
    output_array["variance_segments_length"] = _interval_variance(segments)
    # segment loudness max: first segment holding the largest value
    segment_loudness_max_array = hdf5_getters.get_segments_loudness_max(h5)
    segment_loudness_max_time_array = hdf5_getters.get_segments_loudness_max_time(h5)
    segment_loudness_max_index = 0
    for i, value in enumerate(segment_loudness_max_array):
        if value > segment_loudness_max_array[segment_loudness_max_index]:
            segment_loudness_max_index = i
    output_array["segment_loudness_max"] = segment_loudness_max_array[segment_loudness_max_index]
    output_array["segment_loudness_time"] = segment_loudness_max_time_array[segment_loudness_max_index]

    # POSSIBLE TODO: use average function instead and weight by segment length
    # segment loudness mean and variance (start)
    segment_loudness_array = hdf5_getters.get_segments_loudness_start(h5)
    output_array["segment_loudness_mean"] = numpy.mean(segment_loudness_array)
    output_array["segment_loudness_variance"] = numpy.var(segment_loudness_array)
    # segment pitches: column-wise mean and variance
    segment_pitches_array = hdf5_getters.get_segments_pitches(h5)
    output_array["segment_pitches_mean"] = numpy.mean(segment_pitches_array, axis=0).tolist()
    output_array["segment_pitches_variance"] = numpy.var(segment_pitches_array, axis=0).tolist()
    # segment timbres: column-wise mean and variance
    segment_timbres_array = hdf5_getters.get_segments_timbre(h5)
    output_array["segment_timbres_mean"] = numpy.mean(segment_timbres_array, axis=0).tolist()
    output_array["segment_timbres_variance"] = numpy.var(segment_timbres_array, axis=0).tolist()
    # hotttnesss
    output_array["hottness"] = hdf5_getters.get_song_hotttnesss(h5, 0)
    # duration - start of fade out
    start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5)
    output_array["fade_out"] = duration - start_of_fade_out
    # number of tatums and variance in tatum length
    tatums = hdf5_getters.get_tatums_start(h5)
    output_array["num_tatums"] = len(tatums)
    output_array["variance_tatums_length"] = _interval_variance(tatums)
    # tempo
    output_array["tempo"] = hdf5_getters.get_tempo(h5)
    # time signature
    output_array["time_signature"] = int(hdf5_getters.get_time_signature(h5))
    # year
    output_array["year"] = int(hdf5_getters.get_year(h5))
    # artist terms
    output_array["artist_terms"] = hdf5_getters.get_artist_terms(h5, 0).tolist()
    output_array["artist_terms_freq"] = hdf5_getters.get_artist_terms_freq(h5, 0).tolist()
    output_array["artist_name"] = hdf5_getters.get_artist_name(h5, 0)
    output_array["artist_id"] = hdf5_getters.get_artist_id(h5, 0)
    # title
    output_array["title"] = hdf5_getters.get_title(h5, 0)

    return output_array
Пример #57
0
	#print 
	track = {}
	#Handle each one
	year = h5get.get_year(h5, i)
	if year < 1980 or year > 2010:
		continue;

	song = Song()
	#song.year = year
	#song.hotness = h5get.get_song_hotttnesss(h5, i)

	#print "Hotness: ", song.hotness;
	#if math.isnan(song.hotness):
	#	song.hotness = 0.0;

	song.artist = h5get.get_artist_name(h5, i)
	song.name = h5get.get_title(h5, i)
	#track['track'] = str(song.artist) + " " + str(song.name)
	#track['hotness'] = float(song.hotness)
	track['artist'] = song.artist
	track['name'] = song.name
	song_list.append(track)
	#song.pop_score = calc_poffpop(song)
	#print "Poff Score", song.pop_score
	#all_songs.append(song)
	#print all_songs

json.dump(song_list,w)
w.close()
"""
# Columns written to the flat file, in output order.
_FLAT_FILE_COLUMNS = ("title", "album", "artist_name", "year", "duration",
                      "seg_start_count", "tempo")

# Names of the summary statistics, in the order _summary_stats() returns them.
_STAT_NAMES = ("avg", "max", "min", "stddev", "count", "sum")

# (feature name, hdf5_getters function name) for scalar fields stored as read.
# Getter names are kept as strings so that this table does not need
# hdf5_getters to be importable when the module is loaded.
_PLAIN_FIELDS = (
    ("duration", "get_duration"),
    ("samp_rt", "get_analysis_sample_rate"),
    ("artist_7digitalid", "get_artist_7digitalid"),
    ("artist_id", "get_artist_id"),
    ("artist_mbid", "get_artist_mbid"),
    ("artist_pmid", "get_artist_playmeid"),
    ("audio_md5", "get_audio_md5"),
    ("song_key", "get_key"),
    ("mode", "get_mode"),
    ("release_7digitalid", "get_release_7digitalid"),
    ("song_id", "get_song_id"),
    ("start_fade_out", "get_start_of_fade_out"),
    ("time_sig", "get_time_signature"),
    ("track_id", "get_track_id"),
    ("track_7digitalid", "get_track_7digitalid"),
    ("year", "get_year"),
)

# (feature name, hdf5_getters function name) for scalar fields whose NaN
# values are replaced by -1.
_NAN_CHECKED_FIELDS = (
    ("artist_fam", "get_artist_familiarity"),
    ("artist_hotness", "get_artist_hotttnesss"),
    ("artist_lat", "get_artist_latitude"),
    ("artist_lon", "get_artist_longitude"),
    ("danceability", "get_danceability"),
    ("end_fade_in", "get_end_of_fade_in"),
    ("energy", "get_energy"),
    ("key_c", "get_key_confidence"),
    ("loudness", "get_loudness"),
    ("mode_conf", "get_mode_confidence"),
    ("song_hot", "get_song_hotttnesss"),
    ("tempo", "get_tempo"),
    ("time_sig_c", "get_time_signature_confidence"),
)

# (feature name, hdf5_getters function name) for array fields that are
# summarised with the six statistics listed in _STAT_NAMES.
_AGGREGATED_FIELDS = (
    ("bars_c", "get_bars_confidence"),
    ("bars_start", "get_bars_start"),
    ("beats_c", "get_beats_confidence"),
    ("beats_start", "get_beats_start"),
    ("sec_c", "get_sections_confidence"),
    ("sec_start", "get_sections_start"),
    ("seg_c", "get_segments_confidence"),
    ("seg_loud_max", "get_segments_loudness_max"),
    ("seg_loud_max_time", "get_segments_loudness_max_time"),
    ("seg_loud_start", "get_segments_loudness_start"),
    ("seg_start", "get_segments_start"),
    ("tatms_c", "get_tatums_confidence"),
    ("tatms_start", "get_tatums_start"),
)


def _strip_quotes_report_commas(text):
    """Return *text* without double quotes; print it and pause 1s if it has a comma."""
    text = text.replace('"', '')
    if text.find(',') != -1:
        # Commas are only reported, not removed; csv.writer quotes such values.
        print(text)
        time.sleep(1)
    return text


def _nan_to_minus_one(value):
    """Return -1 when *value* is NaN, otherwise return *value* unchanged."""
    if numpy.isnan(value):
        return -1
    return value


def _summary_stats(values):
    """Return (avg, max, min, stddev, count, sum) of *values* via the file's get_* helpers."""
    return (get_avg(values), get_max(values), get_min(values),
            get_stddev(values), get_count(values), get_sum(values))


def _store_row_stats(features, name, matrix):
    """Store, under ``<name>_<stat>``, one list per statistic with a value for each row of *matrix*."""
    per_stat = dict((stat, []) for stat in _STAT_NAMES)
    for row in matrix:
        for stat, value in zip(_STAT_NAMES, _summary_stats(row)):
            per_stat[stat].append(value)
    for stat in _STAT_NAMES:
        features[name + "_" + stat] = per_stat[stat]


def _genre_array(art_trm, trm_freq):
    """Return the genre column array for the artist terms found in ``genre_dict``."""
    final_genre = []
    for index in get_genre_indexes(trm_freq):
        genre_tmp = get_genre(art_trm, index)
        # Collect every dictionary entry returned for this term.
        final_genre.extend(genre_dict.get_genre_in_dict(genre_tmp))
    if final_genre:
        # Each collected entry is converted to an integer column number.
        return genre_columns([int(genre) for genre in final_genre])
    # No term was found in the dictionary.
    return genre_columns(-1)


def _extract_song_features(h5):
    """Read every field of the open song file *h5* into a dict keyed by feature name."""
    features = {}

    # Text fields: double quotes are removed, commas are reported.
    features["title"] = _strip_quotes_report_commas(hdf5_getters.get_title(h5))
    features["album"] = _strip_quotes_report_commas(hdf5_getters.get_release(h5))
    features["artist_name"] = _strip_quotes_report_commas(
        hdf5_getters.get_artist_name(h5))

    for name, getter in _PLAIN_FIELDS:
        features[name] = getattr(hdf5_getters, getter)(h5)
    for name, getter in _NAN_CHECKED_FIELDS:
        features[name] = _nan_to_minus_one(getattr(hdf5_getters, getter)(h5))

    # Artist location: commas are backslash-escaped; values that start with
    # an "<a" tag or are longer than 100 characters are blanked.
    artist_loc = hdf5_getters.get_artist_location(h5).replace(",", "\\,")
    if artist_loc.startswith("<a") or len(artist_loc) > 100:
        artist_loc = ""
    features["artist_loc"] = artist_loc

    # Array fields and their summary statistics.
    for name, getter in _AGGREGATED_FIELDS:
        values = getattr(hdf5_getters, getter)(h5)
        features[name] = values
        for stat, value in zip(_STAT_NAMES, _summary_stats(values)):
            features[name + "_" + stat] = value

    # Pitch and timbre matrices are transposed so that the statistics are
    # computed per coefficient (presumably 12 rows each -- TODO confirm).
    seg_pitch = hdf5_getters.get_segments_pitches(h5)
    features["seg_pitch"] = seg_pitch
    features["pitch_size"] = len(seg_pitch)
    _store_row_stats(features, "seg_pitch", seg_pitch.transpose())

    seg_timbre = hdf5_getters.get_segments_timbre(h5)
    features["seg_timbre"] = seg_timbre
    # Fixed: the timbre statistics were previously computed from seg_pitch.
    _store_row_stats(features, "seg_timbre", seg_timbre.transpose())

    # Genres
    art_trm = hdf5_getters.get_artist_terms(h5)
    trm_freq = hdf5_getters.get_artist_terms_freq(h5)
    features["art_trm"] = art_trm
    features["trm_freq"] = trm_freq
    features["trn_wght"] = hdf5_getters.get_artist_terms_weight(h5)
    features["a_mb_tags"] = hdf5_getters.get_artist_mbtags(h5)
    features["genre_array"] = _genre_array(art_trm, trm_freq)
    return features


def data_to_flat_file(basedir, ext='.h5'):
    """Extract the information from the song files and create the flat file.

    Walks *basedir* recursively, opens every file whose name ends with *ext*
    through ``hdf5_getters`` and appends one row per file to
    ``metadata_wholeA.csv`` in the current working directory. The row holds
    the columns listed in ``_FLAT_FILE_COLUMNS``; the remaining fields are
    read and aggregated but are not part of the output row yet.

    Args:
        basedir: root directory to search for song files.
        ext: file name suffix of the song files (default ``'.h5'``).

    Returns:
        None. The name of each processed file and the running song count are
        printed as progress output.
    """
    count = 0  # song counter
    # "wb" is the mode the Python 2 csv module expects for output files.
    # The with-statement closes the file even when a song file fails.
    with open("metadata_wholeA.csv", "wb") as out_file:
        writer = csv.writer(out_file)
        for root, dirs, files in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*' + ext)):
                print(f)  # the name of the file
                h5 = hdf5_getters.open_h5_file_read(f)
                try:
                    features = _extract_song_features(h5)
                    # Writing to the flat file
                    writer.writerow(
                        [features[column] for column in _FLAT_FILE_COLUMNS])
                finally:
                    h5.close()
                count += 1
                print(count)
# This script converts the summary H5 file (only ~300 MB) to a CSV file.
# Run it only on the master node, since hdf5_getters cannot open a remote (i.e. HDFS) file.

# Script entry point: reads per-track fields from the summary file
# "msd_summary_file.h5" with "fields.csv" opened as the csv output.
# NOTE(review): this excerpt is cut off before any row is written.
if __name__ == "__main__":

    # "wb" is the mode the Python 2 csv module expects for output files.
    with open("fields.csv", "wb") as f:
        writer = csv.writer(f)  # initialize the csv writer

        # for each track in the summary file, get the 11 fields and output to csv
        h5_file = hdf5_getters.open_h5_file_read("msd_summary_file.h5")
        # NOTE(review): the track count is hard-coded to 1,000,000 --
        # presumably the number of tracks in the summary file; confirm.
        for k in range(1000000):
            print "index!!!: ", k
            # NOTE(review): `id` shadows the builtin of the same name.
            id = hdf5_getters.get_track_id(h5_file, k)  # get track_id TRA13e39..
            title = hdf5_getters.get_title(h5_file, k)  # get song title
            artist_name = hdf5_getters.get_artist_name(h5_file, k)
            year = int(hdf5_getters.get_year(h5_file, k))
            hotness = float(hdf5_getters.get_song_hotttnesss(h5_file, k))
            artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5_file, k))
            # f1..f5 are, in order: tempo, loudness, time signature, duration, key.
            f5 = int(hdf5_getters.get_key(h5_file, k))  # get key
            f2 = float(hdf5_getters.get_loudness(h5_file, k))  # get loudness
            f1 = float(hdf5_getters.get_tempo(h5_file, k))  # get tempo
            f4 = int(hdf5_getters.get_duration(h5_file, k))  # get duration (truncated to whole number by int())
            f3 = float(hdf5_getters.get_time_signature(h5_file, k))  # get time signature

            # Get rid of missing info and change invalid numbers for meta data

            # An empty artist name is replaced by the placeholder "unknown".
            if not artist_name:
                artist_name = "unknown"

            # NOTE(review): this test is true for a familiarity of exactly 0.0
            # but false for NaN; the body of this branch is not present in
            # this excerpt.
            if not artist_familiarity:
Пример #60
0
def get_song_info(h5):
    print '%s - %s | (%s) | %s bpm' % (hdf5_getters.get_artist_name(h5), hdf5_getters.get_title(h5), hdf5_getters.get_year(h5), hdf5_getters.get_tempo(h5))