def constructGraph( conn, p, song_features_table_name, artist_similarities_table_name, graph_table_name, debug_print=False ): cur = conn.cursor() MX_common.resetTable( conn, graph_table_name, ['targeto text', 'similaro text, dist real'] ) # read songs_list q = "SELECT \ song_id \ FROM "\ +song_features_table_name +";" cur.execute(q) songs_list = cur.fetchall() node_stats = dict() node_stats['neighbors_cur_artist'] = list() node_stats['neighbors_topo'] = list() node_stats['neighbors_metric'] = list() node_stats['mean_dist'] = list() node_stats['similar_artists'] = list() edge_stats = dict() edge_stats['dist'] = list() edge_stats['vdist'] = list() # for each song (the focal song), find its neighbors for focal_song_id in songs_list[::1]: focal_song_id = focal_song_id[0] if debug_print: print "focal_song: " + focal_song_id # fetch all focal song's features q = "SELECT \ * \ FROM "\ +song_features_table_name+" "\ "WHERE \ song_id='" + focal_song_id + "';" cur.execute(q) focal_song_f = cur.fetchall() # find all songs that are TOPOLOGICALLY close (based on artist similarity): # find all song of current artist q = "SELECT "\ +"song_id "\ +"FROM "\ +song_features_table_name+" "\ +"WHERE "\ +"artist_id='" + focal_song_f[0][p['invkey']['artist_id']] + "';" cur.execute(q) song_ids_cur_artist = cur.fetchall() node_stats['neighbors_cur_artist'].append(len(song_ids_cur_artist)) if debug_print: for song_id in song_ids_cur_artist: print "same artist songs: " + song_id[0] # find all artists similar to current artist q = "SELECT "\ +"similaro "\ +"FROM "\ +artist_similarities_table_name+" "\ +"WHERE "\ +"targeto='" + focal_song_f[0][p['invkey']['artist_id']] + "';" cur.execute(q) artist_ids_topo = cur.fetchall() node_stats['similar_artists'].append(len(artist_ids_topo)) song_ids_topo = list() # add all song IDs of the similar artists for artist_id in artist_ids_topo: # fetch all songs of this artist q = "SELECT "\ +"song_id "\ +"FROM "\ +song_features_table_name+" "\ +"WHERE "\ 
+"artist_id='" + artist_id[0] + "';" cur.execute(q) cur_song_ids = cur.fetchall() # if len(cur_song_ids)>0: # cur_song_ids = cur_song_ids[0] for song_id in cur_song_ids: song_ids_topo.append(song_id) # stage 2 - goo deeper into the tree of artist_similarities. This requires holding a fifo of nodes / some sort of recursion node_stats['neighbors_topo'].append(len(song_ids_topo)) if debug_print: for song_id in song_ids_topo: print "similar artists songs: " + song_id[0] # find all songs that are METRICALLY close (based on song features) song_ids_metric = list() for key in p['relevant_features']: key_range = ( focal_song_f[key] - p['unit_vec'][key]*p['construct_neighborhood_metric_diameter'] , focal_song_f[key] + ) q = "SELECT "\ +"song_id "\ +"FROM "\ +song_features_table_name+" "\ +"WHERE "\ +key+ ">="+ str(key_range[0]) + " AND "+ key+ "<="+ str(key_range[1])+";" cur.execute(q) cur_song_ids = cur.fetchall() for id in cur_song_ids[0]: song_ids_metric.append(id) node_stats['neighbors_metric'].append(len(song_ids_metric)) # combine all potential neighbors song IDs song_ids_combined = list(set(song_ids_cur_artist) | set(song_ids_topo) | set(song_ids_metric)) # for each potential neighbor, measure its distance from the focal point potential_neighbors = list() for neighbor_song_id in song_ids_combined: # print "song id: "+neighbor_song_id # fetch features for each potential neighbor q = "SELECT \ *\ FROM "\ +song_features_table_name+" "\ +"WHERE "\ +"song_id='" + neighbor_song_id[0] + "';" cur.execute(q) neighbor_song_f = cur.fetchall() # calculate distance to current song sdist = MX_traverse.calcDistance(focal_song_f[0], neighbor_song_f[0], p['distance_type_construct']) # filter songs that are too distant if sdist[0] < p['maximal_overall_distance']: potential_neighbors.append((neighbor_song_id[0],sdist[0], sdist[1])) # sort potential neighbors by distance # potential_neighbors = sorted(potential_neighbors, key=lambda x: x[1]) if len(potential_neighbors)>0: 
node_stats['mean_dist'].append( np.mean([x[1] for x in potential_neighbors]) ) else: node_stats['mean_dist'].append(-1) # add potential neighbors and truncate, if there are too many, based on distance for neighborhood_type in neighboorhoodtypes: for neighbor in potential_neighbors[neighborhood_type][0:min(max_MXG_neighbors[neighborhood_type], len(potential_neighbors[neighborhood_type]))]: # print "n0 " + neighbor[0] # print "n1 " + str(neighbor[1]) q = "INSERT INTO "\ +graph_table_name+" \ (targeto, similaro, dist) \ VALUES \ ('"+ focal_song_id +"', '"+ neighbor[0] +"', '"+ str(neighbor[1]) +"');" cur.execute(q) q = "INSERT INTO "\ +graph_table_name+" \ (targeto, similaro, dist) \ VALUES \ ('"+ neighbor[0] +"', '"+ focal_song_id +"', '"+ str(neighbor[1]) +"');" cur.execute(q) edge_stats['dist'].append( neighbor[1] ) edge_stats['vdist'].append( neighbor[2] ) # MXG_target.append( song_id[0] ) # MXG_source.append( potential_neighbors[0] ) return node_stats, edge_stats
def constructGraph( song_features_table_name, artist_similarities_table_name, graph_table_name, debug_print=False ): #graph_song_list_table_name, global p SFstats = MX_common.initUnitVec( song_features_table_name ) g_conn = psy.connect( MX_common.g_db_conn_command ) g_conn.autocommit = True g_cur = g_conn.cursor() # read song_id list q = "SELECT \ song_id \ FROM "\ +song_features_table_name +";" g_cur.execute(q) songs_list = g_cur.fetchall() node_stats = dict() node_stats['neighbors_cur_artist'] = list() node_stats['neighbors_topo'] = list() node_stats['neighbors_metric'] = list() node_stats['mean_dist'] = list() node_stats['similar_artists'] = list() edge_stats = dict() edge_stats['dist'] = list() edge_stats['vdist'] = list() MX_common.resetTable( g_cur, graph_table_name, ['targeto text', 'similaro text'] ) # for each song (the focal song), find its neighbors for focal_song_id in songs_list[::1]: #::100 focal_song_id = focal_song_id[0] if debug_print: print "focal_song: " + focal_song_id # fetch all focal song's features q = "SELECT \ * \ FROM "\ +song_features_table_name+" "\ "WHERE \ song_id='" + focal_song_id + "';" g_cur.execute(q) focal_song_f = g_cur.fetchall() # find all songs that are TOPOLOGICALLY close (based on artist similarity): potential_neighbors = dict() potential_neighbors_dist = dict() nbkey = 'topo' if nbkey in p['neighborhood_types']: potential_neighbors[nbkey] = list() # find all songs of current artist q = "SELECT "\ +"song_id "\ +"FROM "\ +song_features_table_name+" "\ +"WHERE "\ +"artist_id='" + focal_song_f[0][p['invkey']['artist_id']] + "';" g_cur.execute(q) song_ids_cur_artist = g_cur.fetchall() song_ids_cur_artist = [x[0] for x in song_ids_cur_artist] node_stats['neighbors_cur_artist'].append(len(song_ids_cur_artist)) if debug_print: for song_id in song_ids_cur_artist: print "same artist songs: " + song_id[0] # find all artists similar to current artist q = "SELECT "\ +"similaro "\ +"FROM "\ +artist_similarities_table_name+" "\ +"WHERE 
"\ +"targeto='" + focal_song_f[0][p['invkey']['artist_id']] + "';" g_cur.execute(q) artist_ids_topo = g_cur.fetchall() node_stats['similar_artists'].append(len(artist_ids_topo)) song_ids_topo = list() # add all songs of similar artists for artist_id in artist_ids_topo: # fetch all songs of this artist q = "SELECT "\ +"song_id "\ +"FROM "\ +song_features_table_name+" "\ +"WHERE "\ +"artist_id='" + artist_id[0] + "';" g_cur.execute(q) cur_song_ids = g_cur.fetchall() for song_id in cur_song_ids: song_ids_topo.append(song_id[0]) # TBD - goo deeper into the tree of artist_similarities. This requires holding a fifo of nodes / some sort of recursion node_stats['neighbors_topo'].append(len(song_ids_topo)) if debug_print: for song_id in song_ids_topo: print "similar artists songs: " + song_id[0] # combine all potential TOPOLOGICAL neighbors potential_neighbors[nbkey] = list(set(song_ids_cur_artist) | set(song_ids_topo)) # find all songs that are METRICALLY close (based on song features) nbkey = 'features' if nbkey in p['neighborhood_types']: potential_neighbors[nbkey] = list() for key in p['relevant_features']: radius = abs(p['unit_vec_inv'][p['invkey'][key]]) * p['construct_neighborhood_metric_diameter'] key_range = ( focal_song_f[0][p['invkey'][key]] - radius , focal_song_f[0][p['invkey'][key]] + radius ) q = "SELECT "\ +"song_id "\ +"FROM "\ +song_features_table_name+" "\ +"WHERE "\ +key+ ">="+ str(key_range[0]) + " AND "+ key+ "<="+ str(key_range[1]) + " "\ +"ORDER BY \ RANDOM() \ LIMIT "\ + str(p['max_feature_neighborhood_size']) +";" g_cur.execute(q) cur_song_ids = g_cur.fetchall() for id in cur_song_ids: potential_neighbors[nbkey].append(id[0]) node_stats['neighbors_metric'].append(len(potential_neighbors[nbkey])) # for each potential neighbor, measure its distance from the focal point for nbkey in p['neighborhood_types']: potential_neighbors_dist[nbkey] = list() for neighbor_song_id in potential_neighbors[nbkey]: # fetch features for each potential neighbor q = 
"SELECT \ *\ FROM "\ +song_features_table_name+" "\ +"WHERE "\ +"song_id='" + neighbor_song_id + "';" g_cur.execute(q) neighbor_song_f = g_cur.fetchall() # calculate distance to current song sdist = MX_common.calcDistance(focal_song_f[0], neighbor_song_f[0], p['distance_type_construct']) potential_neighbors_dist[nbkey].append((neighbor_song_id,sdist[0], sdist[1])) # sort potential neighbors by distance potential_neighbors_dist[nbkey] = sorted(potential_neighbors_dist[nbkey], key=lambda x: x[1]) if len(potential_neighbors_dist[nbkey])>0: node_stats['mean_dist'].append( np.mean([x[1] for x in potential_neighbors_dist[nbkey]]) ) else: node_stats['mean_dist'].append(-1) print "#Feature NBs = " + str(len(potential_neighbors_dist[nbkey])) + ", Topo same artist NBs = " + str(len(song_ids_cur_artist)) + ", Topo similar artists NBs = " + str(len(song_ids_topo)) # add potential neighbors and truncate, if there are too many, based on distance for nbkey in p['neighborhood_types']: for neighbor in potential_neighbors_dist[nbkey][0:min(p['max_MXG_neighbors'][nbkey], len(potential_neighbors_dist[nbkey]))]: # add both direction of the edge (later on we call a function that filters duplicate edges) q = "INSERT INTO "\ +graph_table_name+" \ (targeto, similaro) \ VALUES \ ('"+ focal_song_id +"', '"+ neighbor[0] +"');" g_cur.execute(q) q = "INSERT INTO "\ +graph_table_name+" \ (targeto, similaro) \ VALUES \ ('"+ neighbor[0] +"', '"+ focal_song_id +"');" g_cur.execute(q) edge_stats['dist'].append( neighbor[1] ) edge_stats['vdist'].append( neighbor[2] ) g_conn.close() return SFstats, node_stats, edge_stats
def pickNextSongs( cur, p, cur_song_id, mag, conservation=None, debug_print=0 ):
    """Pick the next song suggestions from the current song's graph neighbors.

    Updates the recently-played / avoid-replay FIFOs held in p, scores every
    non-recent neighbor of cur_song_id by a weighted L2 feature distance, and
    samples p['suggestions_num'] candidates without replacement with
    probability proportional to (distance + epsilon) ** p['noisyness'].
    Mutates p ('recently_played', 'currently_playing', 'avoid_recent',
    'debug_neighbors') as a side effect. Returns a list of song-id strings.

    NOTE(review): 'mag' is currently unused -- it fed the commented-out
    ideal-location logic below; confirm before removing the parameter.
    """
    # fetch current song features
    q = "SELECT * FROM " + MX_common.song_features_table_name + \
        " WHERE song_id='" + cur_song_id + "';"
    cur.execute(q)
    cur_song_f = cur.fetchall()
    #cur = conn.cursor()
    verbose = 1
    cur_song_desc = printSongDescription( cur_song_f[0], verbose )

    # Update recently-played FIFO to prevent frequent repeats:
    # push current song into recently-played FIFOs:
    # history FIFO (stores (song_id, description) tuples)
    p['recently_played'].appendleft( p['currently_playing'] )
    # p['recently_played'].appendleft(cur_song_id)
    if len(p['recently_played']) > p['recently_played_num']:
        p['recently_played'].pop()
    p['currently_playing'] = (cur_song_id, cur_song_desc)
    # avoid-replay FIFO (stores bare song_ids; consulted in the filter below)
    p['avoid_recent'].appendleft(cur_song_id)
    if len(p['avoid_recent']) > p['recent_to_avoid_num']:
        p['avoid_recent'].pop()

    # per-feature scaling and the focal song's feature vector
    # NOTE(review): indexing unit_vec_inv with a list implies numpy fancy
    # indexing -- assumes p['unit_vec_inv'] is an ndarray; confirm
    t_scale = p['unit_vec_inv'][p['invkey_distance_features']]
    t_cur_f = np.array([cur_song_f[0][i] for i in p['invkey_distance_features']])
    # conservation weights per distance feature (0.0 = no conservation)
    if conservation != None:
        t_cons_f = np.array([conservation[key] for key in p['distance_features']])
    else:
        t_cons_f = np.array([0.0 for key in p['distance_features']])
    # previous logic of similarity - degenerated
    # t_angle = (1-t_cons_f) / np.linalg.norm(1-t_cons_f)
    # # calculate the ideal location of the next song (in terms of the relevant distance features)
    # t_next_f = t_cur_f + mag*t_angle*t_scale

    # fetch current song neighbors (=potentials for the next song)
    q = "SELECT similaro FROM " + MX_common.MXG_table_name + \
        " WHERE targeto='" + cur_song_id + "';"
    cur.execute(q)
    neighbor_songs = cur.fetchall()
    # filter out duplicates (graph stores both edge directions)
    neighbor_songs = list(set(neighbor_songs))
    print "#neighbors = " + str(len(neighbor_songs))
    # filter out recently-played songs
    neighbor_songs = [x for x in neighbor_songs if x[0] not in p['avoid_recent'] ]
    print "#neighbors (- recent) = " + str(len(neighbor_songs))

    # for each neighbor, fetch features and calc weighted distance to current song
    nb_disted = list()
    p['debug_neighbors'] = list()
    print "#neighbors: " + str(len(neighbor_songs))
    for neighbor in neighbor_songs:
        # fetch features of all neighbors
        q = "SELECT * FROM " + MX_common.song_features_table_name + \
            " WHERE song_id='" + neighbor[0] + "';"
        cur.execute(q)
        neighbor_song_f = cur.fetchall()
        # for each neighbor, calculate its distance from the ideal location
        t_neighbor_f = np.array([neighbor_song_f[0][i] for i in p['invkey_distance_features']])
        sdist = MX_common.calcDistanceInner(t_neighbor_f, t_cur_f, t_scale, t_cons_f, 'L2')
        # log neighbor's details for debug purposes
        # tuples sort by distance first because sdist[0] is element 0
        nb_disted.append((sdist[0], neighbor[0], printSongDescription( neighbor_song_f[0], 1 )))
        trip = list()
        for i in range(len(t_cons_f)):
            trip.append( ( p['invkey_distance_features'][i], p['distance_features'][i], t_cons_f[i], t_neighbor_f[i], t_cur_f[i] ) )
        print trip
    nb_disted = sorted(nb_disted)

    # randomly pick the next song according to probabilities which correspond
    # to the distance between current and neighboring songs.
    # epsilon keeps zero-distance neighbors selectable; p['noisyness'] shapes
    # the distribution (larger -> more weight on distant neighbors)
    epsilon = 0.0001
    xk = np.arange(len(nb_disted))
    pk = (np.array([x[0] for x in nb_disted]) + epsilon) ** p['noisyness']
    pk /= sum(pk)
    print (pk[:5]), sum(pk)
    custm = stats.rv_discrete(name='custm', values=(xk, pk))
    # debug strings: first char marks pick order (0 = not picked)
    p['debug_neighbors'] = list()
    for i in range(len(nb_disted)):
        desc = "0 %4.4f" % pk[i]
        p['debug_neighbors'].append(desc +" "+ nb_disted[i][2])

    # randomize a few potential next songs:
    # pick one, null its probability, resample and so on (sampling without
    # replacement)
    ii = list()
    ii.append(custm.rvs(size=1))
    p['debug_neighbors'][ii[-1]] = str(1) + p['debug_neighbors'][ii[-1]][1:]
    print "i0 = " + str(xk[ii[-1]]), (pk[:5]), sum(pk)
    for j in range(min(len(nb_disted),p['suggestions_num'])-1):
        pk[ii[-1]] = 0
        pk /= sum(pk)
        custm = stats.rv_discrete(name='custm', values=(xk, pk))
        ii.append(custm.rvs(size=1))
        print "i"+ str(j+1) +" = " + str(xk[ii[-1]]), (pk[:5]), sum(pk)
        p['debug_neighbors'][ii[-1]] = str(2+j) + p['debug_neighbors'][ii[-1]][1:]
    next_song_ids = [ nb_disted[i][1] for i in ii]
    return next_song_ids
dfsongs['valence:real'] = dfsongs['valence:real'].fillna(-1); dfsongs['instrumentalness:real'] = dfsongs['instrumentalness:real'].fillna(-1); # write to csv file as an intermediate pd.DataFrame.to_csv(dfsongs, tmp_csv_songs_file, index=False) # open PostgreSQL conn_out = psy.connect("dbname='my_db_test' user='******' host='localhost' password='' port=8787") conn_out.autocommit = True cur = conn_out.cursor() MX_common.importFromDbCsv(conn_out, tmp_csv_songs_file, MX_common.song_features_table_name) cur.execute("CREATE INDEX "\ +MX_common.song_features_song_id_index_name +" \ ON "\ +MX_common.song_features_table_name +" \ (song_id);") cur.execute("CREATE INDEX "\ +MX_common.song_features_track_id_index_name +" \ ON "\ +MX_common.song_features_table_name +" \ (track_id);") cur.execute("CREATE INDEX "\ +MX_common.song_features_7digital_id_index_name +" \ ON "\ +MX_common.song_features_table_name +" \