def ingest(self): # Browse MSD summary file es_bulk_docs = {} msd_id = "" sng_idx = 0 for h5_fd, sng_idx in self.track_generator.get_track(): msd_doc = {} msd_id = hdf5_getters.get_track_id(h5_fd, sng_idx) for getter in getters: field_name = getter.split("get_")[-1] msd_field_name = "msd_" + field_name # prefixed for ES storage try: msd_field_value = hdf5_getters.__getattribute__(getter)(h5_fd, sng_idx) # Type conversions msd_field_value = Ingestor.convert_type(msd_field_value) msd_doc[msd_field_name] = msd_field_value except AttributeError, e: logger.debug("ERROR. AttributeError. {}".format(e)) pass es_bulk_docs[msd_id] = msd_doc # Ingest bulk if size is enough if len(es_bulk_docs) == es_bulk_size: logger.debug("{} files read. Bulk ingest.".format(sng_idx + 1)) logger.debug("Last MSD id read: {}".format(msd_id)) self.es_helper.ingest_to_es(es_bulk_docs) es_bulk_docs = {}
def main():
    """Walk the data directory given on the command line and dump selected
    song properties (skipping songs with NaN hotttnesss) to data.csv."""
    if len(sys.argv) != 2:
        print ('Takes one argument, the directory with the data files.')
        return
    hdf5_files = get_all_files(sys.argv[1])
    # Define properties to get
    properties = ['danceability', 'duration', 'end_of_fade_in', 'energy',
                  'key', 'loudness', 'mode', 'song_hotttnesss',
                  'start_of_fade_out', 'tempo', 'time_signature', 'year',
                  'artist_terms']
    count_datapoints = 0
    with open('data.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(properties)
        for abspath in hdf5_files:
            prop_values = []
            h5 = hdf5_getters.open_h5_file_read(abspath)
            hotness_check = hdf5_getters.get_song_hotttnesss(h5)
            if math.isnan(hotness_check):
                # No hotttnesss ground truth -> useless datapoint, skip file.
                h5.close()
                continue
            for prop in properties:
                prop_value = hdf5_getters.__getattribute__('get_' + prop)(h5)
                # special case artist terms to format it
                if prop == 'artist_terms':
                    prop_values.append(';'.join(str(x) for x in prop_value))
                else:
                    prop_values.append(str(prop_value))
            csvwriter.writerow(prop_values)
            count_datapoints += 1
            h5.close()
    # BUGFIX: the original placeholder '%' is not a str.format field, so the
    # count was never interpolated; '{}' prints it correctly.
    print('Wrote {} datapoints (lines)'.format(count_datapoints))
    sys.exit(0)
def sanity_check_1thread(maindir=None, threadid=-1, nthreads=-1, allfiles=[]):
    """
    Main function, check a bunch of files by opening every field in getter.
    """
    # NOTE(review): mutable default `allfiles=[]` is shared across calls; the
    # assert below forces callers to pass a non-empty list, so it is latent only.
    assert not maindir is None, 'wrong param maindir'
    assert threadid > -1, 'wrong param threadid'
    assert nthreads > 0, 'wrong param nthreads'
    assert len(allfiles) > 0, 'wrong param allfiles, or no files'
    # get getters: every get_* accessor exposed by the GETTERS module
    # (Python 2 filter returns a list here).
    getters = filter(lambda x: x[:4] == 'get_', GETTERS.__dict__.keys())
    # get the files to check: split allfiles into nthreads contiguous slices,
    # this thread takes slice number `threadid`.
    files_per_thread = int(np.ceil(len(allfiles) * 1. / nthreads))
    p1 = files_per_thread * threadid
    p2 = min(len(allfiles), files_per_thread * (threadid + 1))
    # iterate over files between p1 and p2
    for f in allfiles[p1:p2]:
        try:
            h5 = GETTERS.open_h5_file_read(f)
            # Exercise every getter once; any read error flags the file.
            for getter in getters:
                tmp = GETTERS.__getattribute__(getter)(h5)
        except KeyboardInterrupt:
            # Re-raised as a custom error, presumably so a multiprocessing
            # pool can propagate the interrupt cleanly -- TODO confirm.
            raise KeyboardInterruptError()
        except Exception, e:
            print 'PROBLEM WITH FILE:', f
            sys.stdout.flush()
            raise
        finally:
            # NOTE(review): the body of this finally (presumably h5.close())
            # is truncated in this view.
def sanity_check_1thread(maindir=None,threadid=-1,nthreads=-1,allfiles=[]):
    """
    Main function, check a bunch of files by opening every field in getter.
    """
    # NOTE(review): mutable default `allfiles=[]` is shared across calls;
    # the non-empty assert below makes this a latent pitfall only.
    assert not maindir is None,'wrong param maindir'
    assert threadid>-1,'wrong param threadid'
    assert nthreads>0,'wrong param nthreads'
    assert len(allfiles)>0,'wrong param allfiles, or no files'
    # get getters: all get_* accessors from the GETTERS module.
    getters = filter(lambda x: x[:4] == 'get_', GETTERS.__dict__.keys())
    # get the files to check: contiguous slice number `threadid` out of
    # `nthreads` equal slices of allfiles.
    files_per_thread = int(np.ceil(len(allfiles) * 1. / nthreads))
    p1 = files_per_thread * threadid
    p2 = min(len(allfiles),files_per_thread * (threadid+1))
    # iterate over files between p1 and p2
    for f in allfiles[p1:p2]:
        try:
            h5 = GETTERS.open_h5_file_read(f)
            # Exercise every getter once; any read error flags the file.
            for getter in getters:
                tmp = GETTERS.__getattribute__(getter)(h5)
        except KeyboardInterrupt:
            # Custom picklable error, presumably for multiprocessing pools
            # -- TODO confirm against KeyboardInterruptError's definition.
            raise KeyboardInterruptError()
        except Exception,e:
            print 'PROBLEM WITH FILE:',f; sys.stdout.flush()
            raise
        finally:
            # NOTE(review): the finally body (presumably h5.close()) is
            # truncated in this view.
def get(getters, h5file): # sanity check if not os.path.isfile(h5file): print 'ERROR: file', h5file, 'does not exist.' sys.exit(0) h5 = hdf5_getters.open_h5_file_read(h5file) numSongs = hdf5_getters.get_num_songs(h5) songidx = 0 if songidx >= numSongs: print 'ERROR: file contains only',numSongs h5.close() sys.exit(0) line = dict() for getter in getters: try: res = hdf5_getters.__getattribute__('get_' + getter)(h5,songidx) except AttributeError, e: print e if res.__class__.__name__ == 'ndarray': # print getter[4:]+": shape =",res.shape # How to put multidimensional values into file. # Try to put only mean of the values etc... print 'Ignoring....' else: # print getter[4:]+":",res line[getter] = res
def extractSongData(file_name, getters_to_apply):
    """Open ./canciones/<file_name>.h5 and return a 1-D float array holding
    the mean of each requested getter's value, in the given order."""
    h5 = getters.open_h5_file_read('./canciones/' + file_name + '.h5')
    song = np.empty(0)
    for getter_name in getters_to_apply:
        field = getters.__getattribute__(getter_name)(h5)
        song = np.append(song, np.mean(field))
    h5.close()
    return song
def extractValues(hdf5path, summary, fields):
    """Read the requested `fields` (with or without the 'get_' prefix) from
    song index 0 of the .h5 file at `hdf5path` into retDict.

    Exits the process if a requested getter does not exist. ndarray values
    are converted to plain lists; NaN float64 values are replaced by the
    module-level Global_Constant (semantics defined elsewhere -- TODO confirm).
    """
    # summary = False
    songidx = 0
    onegetter = ''
    # print hdf5path
    h5 = hdf5_getters.open_h5_file_read(hdf5path)
    # get all getters (Python 2 filter returns a list)
    keys = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
    getters = []
    keys.remove("get_num_songs")  # special case
    for onegetter in fields:
        if onegetter[:4] != 'get_':
            onegetter = 'get_' + onegetter  # add get_
        try:
            keys.index(onegetter)  # find if keyval exists else exit
        except ValueError:
            print 'ERROR: getter requested:',onegetter,'does not exist.'
            h5.close()
            sys.exit(0)
        getters.append(onegetter)
    getters = np.sort(getters)
    retDict = {}
    # print them
    for getter in getters:
        try:
            res = hdf5_getters.__getattribute__(getter)(h5,songidx)
        except AttributeError, e:
            if summary:
                continue
            else:
                print e
                print 'forgot -summary flag? specified wrong getter?'
                # NOTE(review): no continue here -- execution falls through
                # and reuses `res` from the previous iteration (NameError on
                # the first). Looks like a latent bug; confirm intent.
        #print getter + "\n"
        #print res
        if res.__class__.__name__ == 'float64':
            if math.isnan(res):
                res = Global_Constant
        if res.__class__.__name__ == 'ndarray':
            #print getter[4:]+": shape =",res.shape
            # Copy the array element-by-element into a plain Python list.
            newlist = []
            for i in res:
                newlist.append(i)
            #print newlist
            retDict[getter[4:]] = newlist
        else:
            retDict[getter[4:]] = res
def get_attributes(files, getters):
    """For each getter name, read its value from every file and yield a
    (getter_name, list_of_values) pair.

    Note: every file is reopened once per getter, i.e. len(getters) times.
    """
    for name in getters:
        extract = hdf5_getters.__getattribute__(name)
        values = []
        for path in files:
            handle = hdf5_getters.open_h5_file_read(path)
            values.append(extract(handle))
            handle.close()
        yield name, values
def get_list_attr(path_list, attr): attr_list = [] i = 1 for file in path_list: try: file_read = hdf5_getters.open_h5_file_read(file) attr_list.append(hdf5_getters.__getattribute__(attr)(file_read)) file_read.close() print 'Finished ' + str(i) + '/2350' i += 1 except: print '---- Failed to get ' + file + ' ---- No:' + str(i) attr_list.append(0) i += 1 return (attr_list)
def get_song_info(song_path, pickle_path):
    """Build a dict of getter fields for song index 0 of `song_path`.

    NOTE(review): `pickle_path` is unused in this view and the pickle dump
    mentioned in the comment below is truncated out -- presumably it follows
    this block; confirm against the full file.
    """
    #Create a dictionary with fields and dump in pickle
    data = {}
    data['pickle_id'] = get_song_id(song_path)
    #print data['pickle_id']
    # get params
    hdf5path = song_path
    songidx = 0
    onegetter = ''
    # if len(sys.argv) > 2:
    #     songidx = int(sys.argv[2])
    # if len(sys.argv) > 3:
    #     onegetter = sys.argv[3]
    # sanity check
    if not os.path.isfile(hdf5path):
        print 'ERROR: file', hdf5path, 'does not exist.'
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(hdf5path)
    numSongs = hdf5_getters.get_num_songs(h5)
    if songidx >= numSongs:
        print 'ERROR: file contains only', numSongs
        h5.close()
        sys.exit(0)
    # get all getters (project helper; returns getter names -- TODO confirm)
    getters = get_modified_getters()
    #print getters
    # print them
    for getter in getters:
        try:
            res = hdf5_getters.__getattribute__(getter)(h5, songidx)
        except AttributeError, e:
            # NOTE(review): `summary` is not defined in this function, so a
            # failing getter raises NameError here -- latent bug; also no
            # `continue` on the else branch, so stale `res` would be reused.
            if summary:
                continue
            else:
                print e
                print 'forgot -summary flag? specified wrong getter?'
        if res.__class__.__name__ == 'ndarray':
            # Array fields are only reported, not stored in `data`.
            print getter[4:] + ": shape =", res.shape
        else:
            data[getter[4:]] = str(res)
def main():
    """Walk the data directory given on the command line and dump selected
    song properties (skipping songs with NaN hotttnesss) to data.csv."""
    if len(sys.argv) != 2:
        print('Takes one argument, the directory with the data files.')
        return
    hdf5_files = get_all_files(sys.argv[1])
    # Define properties to get
    properties = [
        'danceability', 'duration', 'end_of_fade_in', 'energy', 'key',
        'loudness', 'mode', 'song_hotttnesss', 'start_of_fade_out', 'tempo',
        'time_signature', 'year', 'artist_terms'
    ]
    count_datapoints = 0
    with open('data.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(properties)
        for abspath in hdf5_files:
            prop_values = []
            h5 = hdf5_getters.open_h5_file_read(abspath)
            hotness_check = hdf5_getters.get_song_hotttnesss(h5)
            if math.isnan(hotness_check):
                # No hotttnesss ground truth -> skip this file.
                h5.close()
                continue
            for prop in properties:
                prop_value = hdf5_getters.__getattribute__('get_' + prop)(h5)
                # special case artist terms to format it
                if prop == 'artist_terms':
                    prop_values.append(';'.join(str(x) for x in prop_value))
                else:
                    prop_values.append(str(prop_value))
            csvwriter.writerow(prop_values)
            count_datapoints += 1
            h5.close()
    # BUGFIX: '%' is not a str.format placeholder, so the count was never
    # interpolated; '{}' prints it correctly.
    print('Wrote {} datapoints (lines)'.format(count_datapoints))
    sys.exit(0)
def main():
    """Walk a hard-coded MSD subset directory, read every getter field from
    each .h5 file, and store per-song dicts into the module-level `music`
    mapping keyed by a running file counter."""
    rootdir = '/Users/Jerry/desktop/2017_Spring/COSI 132A/project/MillionSongSubset/data'
    number = 0
    for subdir, dirs, files in os.walk(rootdir):
        for f in files:
            fileroot = os.path.join(subdir, f)
            h5 = hdf5_getters.open_h5_file_read(fileroot)
            # numSongs = hdf5_getters.get_num_songs(h5)
            # if numSongs>1:
            #     print fileroot, numSongs, "\n"
            # above code has checked song is 1 for all h5 file
            # All get_* accessors, sorted by name (Python 2 list filter).
            getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
            getters = np.sort(getters)
            songidx = 0
            number = number + 1
            song = {}
            # print them
            for getter in getters:
                try:
                    res = hdf5_getters.__getattribute__(getter)(h5, songidx)
                except AttributeError, e:
                    # NOTE(review): `summary` is not defined in this function
                    # -- a failing getter raises NameError here; and the else
                    # branch has no continue, so stale `res` is reused. Latent
                    # bugs; confirm against module globals.
                    if summary:
                        continue
                    else:
                        print e
                        print 'forgot -summary flag? specified wrong getter?'
                if res.__class__.__name__ == 'ndarray':
                    song[getter[4:]] = list(res)
                    # print getter[4:]+": shape =",res.shape
                else:
                    # print getter[4:]+":",res
                    song[getter[4:]] = str(res)
                    # song[getter[4:]] = res
            h5.close()
            # NOTE(review): `music` is presumably a module-level dict -- not
            # visible here; confirm it is defined before main() runs.
            music[number] = song
# Fragment: continuation of an if/elif chain selecting which getters to run
# (the preceding branch is outside this view).
elif onegetter != '':
    # A single getter was requested; normalize to the 'get_' prefix.
    if onegetter[:4] != 'get_':
        onegetter = 'get_' + onegetter
    try:
        # Validate the requested getter exists, else exit.
        getters.index(onegetter)
    except ValueError:
        print 'ERROR: getter requested:', onegetter, 'does not exist.'
        h5.close()
        sys.exit(0)
    getters = [onegetter]
getters = np.sort(getters)
# print them
for getter in getters:
    try:
        res = hdf5_getters.__getattribute__(getter)(h5, songidx)
    except AttributeError, e:
        if summary:
            continue
        else:
            print e
            print 'forgot -summary flag? specified wrong getter?'
            # NOTE(review): no continue here -- stale `res` from the previous
            # iteration is reused below; latent bug, confirm intent.
    if res.__class__.__name__ == 'ndarray':
        #print getter[4:]+": shape =",res.shape
        d[getter[4:]] = res.tolist()
    else:
        #print getter[4:]+":",res
        # NaN != NaN, so this maps NaN floats to None before storing.
        if res != res:
            res = None
        d[getter[4:]] = res
for filed in files: h5 = hdf5_getters.open_h5_file_read(root+"/"+filed) # get all getters keys_to_extract = ["get_song_id","get_title", "get_track_id", "get_artist_id", "get_artist_name", "get_duration", "get_year", "get_artist_location", "get_artist_familiarity", "get_artist_hotttnesss", "get_loudness"] getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys()) getters.remove("get_num_songs") # special case getters = list(set(getters).intersection(set(keys_to_extract))) getters.sort(key=lambda x: keys_to_extract.index(x)) # write extracted data to file csvstring = [] for getter in getters: try: res = hdf5_getters.__getattribute__(getter)(h5,songidx) if getter == "get_song_id": if res in song_dict: song_numeric_id = song_dict[res] else: ids += 1 song_numeric_id = song_dict[res] = ids csvstring.append(song_numeric_id) except AttributeError, e: continue if res.__class__.__name__ == 'ndarray': continue
# Fragment: `song`, `song_order`, `output_dir`, `input_dir`, `getters` and
# `logger` come from the enclosing scope, outside this view.
song_order[song[0]] = song[1]
outputDir = output_dir
i = 0
hits = 0
for dirpath, dirnames, filenames in os.walk(input_dir):
    for track_file in filenames:
        #print track_file
        #song = re.split(r'[ ]', songs[i])
        # Build an XML <song> document, one element per getter field.
        output = "<song xmlns=\'http://labrosa.ee.columbia.edu/millionsong/\'>\n"
        h5 = hdf5_getters.open_h5_file_read(os.path.join(dirpath, track_file))
        song_id = hdf5_getters.get_song_id(h5)
        for getter in getters:
            try:
                res = hdf5_getters.__getattribute__(getter)(h5)
            except AttributeError, e:
                continue
            if res.__class__.__name__ == 'ndarray':
                # Arrays are summarized by their shape only.
                output = output + "<" + getter[4:] + ">" + str(
                    res.shape) + "</" + getter[4:] + ">\n"
            else:
                output = output + "<" + getter[4:] + ">" + str(
                    res) + "</" + getter[4:] + ">\n"
        h5.close()
        if song_id in song_order:
            # [:-1] strips a trailing newline from the stored order value
            # -- presumably read from a file; confirm upstream.
            output = output + "<order>" + song_order[
                song_id][:-1] + "</order>\n"
            logger.debug(track_file + ' HIT')
            hits = hits + 1
def transfer(h5path, matpath=None, force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there are more than one song in the HDF5 file, each field name gets
    a number happened: 1, 2, 3, ...., numfiles
    PARAM
        h5path - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path with a
                  different extension by default
        force - if True and matfile exists, overwrite
    RETURN
        True if the file was transfered, False if there was a problem.
        Could also raise an IOException
    NOTE
        All the data has to be loaded in memory! be careful if one file
        contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print 'path to HF5 files does not exist:', h5path
        return False
    if not os.path.splitext(h5path)[1] == '.h5':
        print 'expecting a .h5 extension for file:', h5path
        return False
    # check matfile: default destination is the .h5 path with a .mat extension
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if force:
            print 'overwriting file:', matpath
        else:
            # print 'matfile',matpath,'already exists (delete or force):'
            return False
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form
    getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
    getters.remove("get_num_songs")  # special case
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    # transfer
    nSongs = hdf5_getters.get_num_songs(h5)
    matdata = {
        'transfer_note':
        'transferred on ' + time.ctime() + ' from file: ' + h5path
    }
    try:
        # iterate over songs
        for songidx in xrange(nSongs):
            # iterate over getter
            for getter in getters:
                gettername = getter[4:]
                if nSongs > 1:
                    # disambiguate fields with a 1-based song suffix
                    gettername += str(songidx + 1)
                data = hdf5_getters.__getattribute__(getter)(h5, songidx)
                matdata[gettername] = data
    except MemoryError:
        print 'Memory Error with file:', h5path
        print 'All data has to be loaded in memory before being saved as matfile'
        print 'Is this an aggregated / summary file with tons of songs?'
        print 'This code is optimized for files containing one song,'
        print 'but write me an email! (TBM)'
        raise
    finally:
        # close h5
        h5.close()
    # create the matfile only after every field was read successfully
    sio.savemat(matpath, matdata)
    # all good
    return True
# Fragment: `numSongs`, `filename`, `f` (output file), `h5`, `songidx` and
# `sel_get` come from the enclosing scope, outside this view.
if numSongs>1:
    print "Error: More than one song is included in file ", filename
    f.close()
    sys.exit(0)
# All get_* accessors, minus the num_songs special case, sorted by name.
getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
getters.remove("get_num_songs")  # special case
getters = np.sort(getters)
# Fixed output schema: every column defaults to '' when the getter is not
# in sel_get or raises.
dict_get ={'get_track_id':'','get_track_7digitalid':'','get_title':'','get_artist_id':'','get_artist_7digitalid':'','get_artist_name':'','get_artist_hotttnesss':'','get_artist_latitude':'','get_artist_location':'','get_artist_longitude':'','get_danceability':'','get_duration':'','get_energy':'','get_loudness':'','get_release':'','get_release_7digitalid':'','get_song_hotttnesss':'','get_song_id':'','get_tempo':'','get_time_signature':'','get_time_signature_confidence':'','get_year':''}
# print them
for getter in getters:
    if getter in sel_get:
        try:
            dict_get[getter] = hdf5_getters.__getattribute__(getter)(h5,songidx)
        except AttributeError, e:
            continue
# Emit one tab-separated row in the dict_get column order (truncated: the
# remaining f.write columns are outside this view).
f.write(str(dict_get['get_track_id']) + '\t')
f.write(str(dict_get['get_track_7digitalid']) + '\t')
f.write(str(dict_get['get_title']) + '\t')
f.write(str(dict_get['get_artist_id']) + '\t')
f.write(str(dict_get['get_artist_7digitalid']) + '\t')
f.write(str(dict_get['get_artist_name']) + '\t')
f.write(str(dict_get['get_artist_hotttnesss']) + '\t')
f.write(str(dict_get['get_artist_latitude']) + '\t' )
f.write(str(dict_get['get_artist_location']) + '\t' )
f.write(str(dict_get['get_artist_longitude']) + '\t')
f.write(str(dict_get['get_danceability']) + '\t' )
f.write(str(dict_get['get_duration']) + '\t' )
freq = {} #2-dim dict of dict's represented as freq[year][term] uniqueWordLst = [] #uniqueVector of unique words to loop yearLst = [] #Vector of all years for hdf5path in allh5: #params songidx = 0 #sanity check (use when dir scanning works) if not os.path.isfile(hdf5path): print 'ERROR: file',hdf5path,'does not exist.' continue #PRINT THE FREQUENCY LIST h5 = hdf5_getters.open_h5_file_read(hdf5path) artist_terms = hdf5_getters.__getattribute__('get_artist_terms')(h5,0) year = hdf5_getters.__getattribute__('get_year')(h5,0) #fill up the freq dict if(year != 0): freq[year] = [] if(not(year in yearLst)): yearLst.append(year) for words in artist_terms: words = words.split() for word in words: if(not(word in uniqueWordLst)): uniqueWordLst.append(word) incrWordFreq(freq[year], word) h5.close()
def transfer(h5path,matpath=None,force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there are more than one song in the HDF5 file, each field name gets
    a number happened: 1, 2, 3, ...., numfiles
    PARAM
        h5path - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path with a
                  different extension by default
        force - if True and matfile exists, overwrite
    RETURN
        True if the file was transfered, False if there was a problem.
        Could also raise an IOException
    NOTE
        All the data has to be loaded in memory! be careful if one file
        contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print 'path to HF5 files does not exist:',h5path
        return False
    if not os.path.splitext(h5path)[1] == '.h5':
        print 'expecting a .h5 extension for file:',h5path
        return False
    # check matfile: default destination is the .h5 path with .mat extension
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if force:
            print 'overwriting file:',matpath
        else:
            print 'matfile',matpath,'already exists (delete or force):'
            return False
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form
    getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
    getters.remove("get_num_songs")  # special case
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    # transfer
    nSongs = hdf5_getters.get_num_songs(h5)
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    try:
        # iterate over songs
        for songidx in xrange(nSongs):
            # iterate over getter
            for getter in getters:
                gettername = getter[4:]
                if nSongs > 1:
                    # disambiguate fields with a 1-based song suffix
                    gettername += str(songidx+1)
                data = hdf5_getters.__getattribute__(getter)(h5,songidx)
                matdata[gettername] = data
    except MemoryError:
        print 'Memory Error with file:',h5path
        print 'All data has to be loaded in memory before being saved as matfile'
        print 'Is this an aggregated / summary file with tons of songs?'
        print 'This code is optimized for files containing one song,'
        print 'but write me an email! (TBM)'
        raise
    finally:
        # close h5
        h5.close()
    # create the matfile only after every field was read successfully
    sio.savemat(matpath,matdata)
    # all good
    return True
# Fragment: `song`, `song_order`, `output_dir`, `input_dir`, `getters`,
# `logger` and `listen_dict` come from the enclosing scope, outside this view.
song_order[song[0]] = song[1]
outputDir = output_dir
i = 0
hits = 0
for dirpath, dirnames, filenames in os.walk(input_dir):
    for track_file in filenames:
        #print track_file
        #song = re.split(r'[ ]', songs[i])
        # Build an XML <song> document, one element per getter field.
        output = "<song xmlns=\'http://labrosa.ee.columbia.edu/millionsong/\'>\n"
        h5 = hdf5_getters.open_h5_file_read(os.path.join(dirpath, track_file))
        song_id = hdf5_getters.get_song_id(h5)
        for getter in getters:
            try:
                res = hdf5_getters.__getattribute__(getter)(h5)
            except AttributeError, e:
                continue
            if res.__class__.__name__ == 'ndarray':
                # Arrays are summarized by their shape only.
                output = output + "<"+getter[4:]+">"+str(res.shape)+"</"+getter[4:]+">\n"
            else:
                output = output + "<"+getter[4:]+">"+str(res)+"</"+getter[4:]+">\n"
        h5.close()
        if song_id in song_order:
            # [:-1] strips a trailing newline from the stored order value
            # -- presumably read from a file; confirm upstream.
            output = output + "<order>" + song_order[song_id][:-1] + "</order>\n"
            logger.debug(track_file +' HIT')
            hits = hits + 1
        if song_id in listen_dict:
            logger.debug("user listens: " + track_file)
            # (truncated: the per-user-listen body is outside this view)
            for user_listen in listen_dict[song_id]: