def main():
    """Train the segment-analysis autoencoder over all tracks in minibatches.

    Pulls tracks (not yet linked to playlists) from Neo4j, converts each
    track's Spotify audio-analysis segments to fixed-length vectors,
    (re)trains the Autoencoder batch by batch — resuming from the newest
    saved weights — and writes the resulting latent z-vectors back onto
    the Track nodes.
    """
    from typing import List

    import os

    import pandas as pd
    from neomodel import config, db
    from tqdm import tqdm

    from model.conv_autoencoder import Autoencoder
    from model.graph import connection_url
    from model.graph.spotify.track import Track
    from model.graph.spotify.playlist import Playlist

    config.DATABASE_URL = connection_url()
    db.set_connection(connection_url())

    # Segment analysis
    def segment_to_vector(segment) -> list:
        # Flatten one Spotify analysis segment into a non-negative feature
        # vector: 4 scalar duration/loudness features + pitches + timbre.
        return list(
            map(abs, [segment['duration'], segment['loudness_start'],
                      segment['loudness_max_time'], segment['loudness_max']]
                + segment['pitches'] + segment['timbre']))

    stopval = len(Track.nodes)
    print('Number of tracks:', stopval)
    print('Number of playlists:', len(Playlist.nodes))

    def get_minibatches(stopval, offset=0, interval=500):
        # Yield successive batches of tracks until the whole set is consumed.
        while offset < stopval:
            to_analyze: List[Track] = Track.get_songs_not_in_playlists(
                interval, offset=offset)
            yield to_analyze
            offset += len(to_analyze)
            print(f'{offset}/{stopval}')

    for _, batch in enumerate(get_minibatches(stopval)):
        # Keep only tracks that actually carry an analysis blob, so the
        # uri DataFrame stays aligned with the segment arrays below
        # (previously `tracks` kept all rows and could misalign).
        tracks = [a for a in batch if a.analysis]
        X = [a.analysis['segments'] for a in tracks]
        arrs = [list(map(segment_to_vector, sample)) for sample in X]
        print(tracks)

        # Resume from the most recent checkpoint, if one exists.
        checkpoints = [f for f in os.listdir('models')
                       if 'segment_analysis' in f]
        latest = os.path.join('models', max(checkpoints)) if checkpoints else None

        to_pad = 800
        arrs = Autoencoder.pad_and_normalize(arrs, to_pad=to_pad)
        auto = Autoencoder.train('segment_analysis', arrs,
                                 weights_path=latest, epochs=10)
        df = pd.DataFrame({'uri': [a.uri for a in tracks]})
        df = Autoencoder.store_zVectors(auto, arrs, df)
        for _, row in tqdm(df.iterrows()):
            Track.add_zVector(row['uri'], row['zVector'].tolist())
def add_zVector(cls, uri, data: List[List[float]]):
    '''
    Adds zVector data to the track

    :param uri: the Spotify URI of the track
    :param data: the zVector data as a list of lists, not a numpy array
    :return: the raw result of the update query (snapshot of the node's
        properties before the update)
    '''
    db.set_connection(connection_url())
    # Neo4j properties cannot hold nested lists, so the vector is stored
    # as a JSON string under the 'zVector' key.
    payload = json.dumps({'zVector': data})
    # Parameterized query: the previous version interpolated the URI and
    # JSON blob directly into the Cypher string, which breaks on quotes
    # and is injectable.
    query = '''
    MATCH (t:Track {uri: $uri})
    WITH t, properties(t) as snapshot
    SET t.zVector = $payload
    RETURN snapshot
    '''
    track = db.cypher_query(query, {'uri': uri, 'payload': payload})
    return track
def main():
    # Rebuild the billboard graph: wipe the Neo4j database, then walk the
    # weekly chart CSVs, create billboard Track nodes, and cross-reference
    # each chart entry against Genius (and, when a Spotify URI is attached,
    # against Spotify).
    import pandas as pd
    import os
    from ast import literal_eval
    from scraper.genius import Genius
    from scraper.spotify import Spotify
    from model.graph import connection_url
    from model.graph.billboard.track import Track
    from neomodel import db, clear_neo4j_database, config

    url = connection_url()
    print(url)
    config.DATABASE_URL = url
    db.set_connection(url)
    print('connected')
    # Destructive: clears the ENTIRE database before reloading.
    clear_neo4j_database(db)
    BILLBOARD_DIR = os.path.join('output', 'billboard')
    weeks = os.listdir(BILLBOARD_DIR)
    # Newest week first (directory names presumably sort by date —
    # TODO confirm naming scheme).
    weeks.sort(reverse=True)
    for week in weeks:
        df = pd.read_csv(os.path.join(BILLBOARD_DIR, week, 'main.csv'))
        for i, row in df.iterrows():
            billboard_track = Track.inst(**dict(row))
            print(billboard_track)

            # Sort artists by appearance in the title
            artists = ', '.join(
                list(
                    map(
                        lambda x: x['artist_name'],
                        sorted(literal_eval(row['credited_artists']),
                               key=lambda x: x['ordinal']))))
            search_str = row['title'] + ' ' + artists
            genius_resp = Genius.search(search_str)
            print('\n\nSearching for:', search_str)
            if genius_resp['meta']['status'] != 200:
                raise ValueError(
                    'Probably exceeded Genius limits or invalid search')
            genius_resp = genius_resp['response']['hits']
            for hit in genius_resp:
                hit = hit['result']
                song_data = Genius.get_song(hit['id'])['response']['song']
                # Only hits whose Genius media list includes a Spotify
                # entry can be matched back to a Spotify track.
                if 'spotify' in [a['provider'] for a in song_data['media']]:
                    print('Spotify exists!')
                    for i, a in enumerate(song_data['media']):
                        print(a)
                        if a['provider'] == 'spotify':
                            print('Spotify Exists -', song_data['full_title'])
                            spotify_data = Spotify.get_track(
                                song_data['media'][i]['native_uri'])
                            print(spotify_data)
                            break
            # NOTE(review): quit() ends the whole process after the first
            # chart row — looks like leftover debugging; confirm intent
            # before running against the full dataset.
            quit()
    print(weeks)
def get_playlists(spotify_id) -> List[Playlist]:
    '''
    Gets playlists the given track appears in

    :param spotify_id: the Spotify ID of the track
    :return: the list of playlists that the track appears in
    '''
    db.set_connection(connection_url())
    # Parameterized query: the ID is passed as $sid instead of being
    # interpolated into the Cypher text (injection/quoting hazard).
    results, meta = db.cypher_query(
        'MATCH (t:Track {spotify_id: $sid})-[:`FEATURED IN`]->(p:Playlist) '
        'RETURN p',
        {'sid': spotify_id})
    return [Playlist.inflate(row[0]) for row in results]
def find(cls, uri: str):
    '''
    Finds a track given Spotify URI

    :param uri: the Spotify URI
    :return: the list of matching tracks (usually zero or one)
    '''
    db.set_connection(connection_url())
    # Parameterized query instead of %-interpolating the URI into Cypher.
    results, meta = db.cypher_query(
        query='''
        MATCH (t: Track {uri: $uri})
        RETURN t
        ''',
        params={'uri': uri})
    return [cls.inflate(track[0]) for track in results]
def get_album(track_id: str):
    '''
    Gets the album given the Spotify ID

    :param track_id: the Spotify ID of the track
    :return: the album it's on
    '''
    db.set_connection(connection_url())
    # Parameterized query instead of %-interpolating the ID into Cypher.
    results, meta = db.cypher_query(
        query='''
        MATCH (t: Track {spotify_id: $tid})-[:FROM]->(a: Album)
        RETURN a
        ''',
        params={'tid': track_id})
    return [Album.inflate(album[0]) for album in results]
def get_tracks(cls, spotify_id):
    '''
    Gets the tracks featured in the given playlist as lightweight records.

    :param spotify_id: the Spotify ID of the playlist
    :return: a list of SmallTrack objects (uri, zVector, spotify_id, name)
    '''
    s = time()
    from model.graph.spotify.track import SmallTrack
    db.set_connection(connection_url())
    # Parameterized query; returns only the four columns SmallTrack needs
    # rather than whole nodes.
    query = '''
    MATCH (p: Playlist {spotify_id: $sid}) <-[r1: `FEATURED IN`]- (t: Track)
    RETURN t.uri, t.zVector, t.spotify_id, t.name
    '''
    results, meta = db.cypher_query(query, {'sid': spotify_id})
    print('Results fetched', time() - s)
    keys = ['uri', 'zVector', 'spotify_id', 'name']
    # zip pairs column names with row values directly — clearer than
    # index-based enumerate dict building.
    return [SmallTrack(**dict(zip(keys, result))) for result in results]
def get_associated_artists(cls, artist_id):
    '''
    Gets artists associated with the given artist

    :param artist_id: the Spotify ID of the given artist
    :return: the set of artists appearing on the same tracks
    '''
    db.set_connection(connection_url())
    # Parameterized query instead of %-interpolating the ID into Cypher.
    # NOTE(review): the first MATCH only requires t to be featured in some
    # playlist — preserved as-is from the original query; confirm it is
    # intentional and not a stray cartesian-product clause.
    results, meta = db.cypher_query(
        '''
        MATCH (t) -[:`FEATURED IN`]-> (p: Playlist)
        MATCH (a: Artist {spotify_id: $aid}) <-[r1: BY]- (t: Track)
              -[r2: BY]-> (similar_artists: Artist)
        RETURN similar_artists
        ''',
        {'aid': artist_id})
    # Set literal comprehension de-duplicates inflated artist nodes.
    return {cls.inflate(artist[0]) for artist in results}
def get_similar_playlists(track_id: str, playlist_id: str) -> List[Playlist]:
    '''
    Gets similar playlists to the given playlist

    :param track_id: the Spotify ID of the track currently playing
    :param playlist_id: the Spotify ID of the playlist currently playing
    :return: a list of similar playlists
    '''
    # Similar = playlists sharing at least one track with the current
    # playlist (which must itself contain the current track).
    # Parameterized instead of %-interpolating the IDs into Cypher.
    query = '''
    match (t: Track {spotify_id: $tid}) -[r1:`FEATURED IN`]->
    (p: Playlist {spotify_id: $pid}) <-[r2:`FEATURED IN`]-
    (other_tracks: Track) -[r3:`FEATURED IN`]-> (similar_playlists: Playlist)
    return similar_playlists
    '''
    db.set_connection(connection_url())
    results, meta = db.cypher_query(
        query=query, params={'tid': track_id, 'pid': playlist_id})
    return [Playlist.inflate(playlist[0]) for playlist in results]
def get_tracks_with_multiple_artists(cls, context, *artist_ids):
    '''
    Gets tracks with multiple artists on it

    :param context: the Spotify ID of the playlist
    :param artist_ids: the given artist URIs
    :return: the list of tracks on the shortest path between the artists
    '''
    # Aliases (a0, a1, ...) must be spliced into the pattern text, but the
    # URI and playlist-ID values are passed as query parameters to avoid
    # quoting/injection problems.
    params = {'context': context}
    patterns = []
    for i, uri in enumerate(artist_ids):
        params[f'uri{i}'] = uri
        patterns.append('(a%d:Artist { uri: $uri%d })' % (i, i))
    artists = ','.join(patterns)
    # NOTE(review): only a0 and a1 participate in the path, so any artists
    # beyond the first two constrain nothing — behavior preserved from the
    # original query; confirm intent.
    query = '''MATCH %s, p = allShortestPaths((a0)-[*]-(a1))
    WHERE EXISTS((a0) <-[:BY]- (:Track) -[:`FEATURED IN`]->
                 (:Playlist {spotify_id: $context}))
    RETURN nodes(p)
    ''' % artists
    from model.graph.spotify.track import Track
    db.set_connection(connection_url())
    results, meta = db.cypher_query(query, params)
    # nodes(p) for a one-hop-or-more path: element [1] is the node after a0.
    return [Track.inflate(result[0][1]) for result in results]
def main():
    '''
    Gets all the albums in the database, loops through associated tracks,
    and searches Genius - if Spotify URI is included in Genius data, the
    node is updated to include Genius data
    :return:
    '''
    from time import sleep
    from neomodel import config
    from scraper.genius import Genius
    from model.graph import connection_url
    from model.graph.spotify.track import Track
    from model.graph.spotify.album import Album

    config.DATABASE_URL = connection_url()
    print(len(Album.nodes))
    for i, node in enumerate(Album.nodes):
        # NOTE(review): caps the run at the first 6 albums — looks like a
        # debugging limit; confirm before a full run.
        if i > 5:
            break
        print('\n\nCurrent Album:', node.name)
        for track in node.tracks:
            # Search Genius by "<track name> <primary artist name>".
            to_search = f'{track.name} {track.artists[0].name}'
            print('\nSearching for', to_search)
            resp = Genius.search(to_search)
            print(resp)
            if resp['meta']['status'] == 200:
                resp = resp['response']
                if 'hits' in resp:
                    for hit in resp['hits']:
                        song_data = Genius.get_song(
                            hit['result']['id'])['response']['song']
                        print(song_data['media'])
                        # Only hits carrying a Spotify media entry can be
                        # linked back to a Track node by URI.
                        if 'spotify' in [
                                a['provider'] for a in song_data['media']
                        ]:
                            print('Spotify exists!')
                            # NOTE(review): inner loop reuses `i`, shadowing
                            # the album index (harmless here since the outer
                            # loop reassigns it, but worth renaming).
                            for i, a in enumerate(song_data['media']):
                                print(a)
                                if a['provider'] == 'spotify':
                                    in_db = Track.nodes.get_or_none(
                                        uri=song_data['media'][i]
                                        ['native_uri'])
                                    if in_db:
                                        print('Track exists:', in_db.name)
                                    else:
                                        print('Track is not in database')
                                    # Attach the Genius payload to the node,
                                    # then re-fetch to verify the update.
                                    Track.add_genius(
                                        song_data['media'][i]['native_uri'],
                                        song_data)
                                    new_track = Track.nodes.get_or_none(
                                        uri=song_data['media'][i]
                                        ['native_uri'])
                                    if new_track:
                                        print('Track updated',
                                              new_track.genius_data)
                                    else:
                                        print('Track not updated', new_track)
            # Throttle Genius API calls.
            sleep(1)
def main():
    """Incrementally fit an LDA model over track audio features, one
    checkpoint per minibatch.

    Tracks are streamed from Neo4j in batches; each batch's features are
    fed to ``LatentDirichletAllocation.partial_fit``. After every batch the
    model is dumped to ``trained_models/`` with a zero-padded sequence
    number and the previous checkpoint is deleted. Restartable: resumes
    from the newest checkpoint on disk.
    """
    from contextlib import suppress
    from math import log10
    from typing import List
    import os

    import numpy as np
    from joblib import dump, load
    from neomodel import config, db
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.preprocessing import MultiLabelBinarizer
    from tqdm import tqdm

    from model.graph import connection_url
    from model.graph.spotify.track import Track
    from model.graph.spotify.playlist import Playlist

    config.DATABASE_URL = connection_url()
    db.set_connection(connection_url())

    stopval = len(Track.nodes)
    print(stopval)
    print('Playlists', len(Playlist.nodes))

    playlists = Playlist.get_all()
    num_playlists = len(playlists)
    # Map playlist URI -> dense index, used as the label space below.
    playlists = {node.uri: ind for ind, node in enumerate(playlists)}

    def get_minibatches(stopval, count=0, interval=20):
        # Yield (abs-feature matrix, one-hot playlist membership) batches.
        while count < stopval:
            to_analyze: List[Track] = Track.get_songs_in_playlists(
                interval, count)
            X = [a.get_song_features(as_list=True) for a in to_analyze]
            y = [[playlists[x.uri] for x in Track.get_playlists(a.spotify_id)]
                 for a in to_analyze]
            print(count, interval)
            yield (np.array([[abs(v) for v in row] for row in X]),
                   MultiLabelBinarizer().fit_transform(y))
            count += len(to_analyze)

    lda = LatentDirichletAllocation(n_components=num_playlists)
    startval = 0
    if len(os.listdir('trained_models')) > 0:
        # Resume from the newest checkpoint; zero-padded filenames sort
        # lexicographically in numeric order.
        startval = max(os.listdir('trained_models'))
        lda = load(os.path.join('trained_models', startval))
        startval = int(startval.split('.')[0])

    interval = 20
    # Width for zero-padding checkpoint names so they sort correctly.
    width = int(log10(stopval) + 1)
    for i, val in enumerate(
            tqdm(get_minibatches(stopval, count=startval * interval))):
        i += startval + 1
        X, y = val
        # LDA is unsupervised: y (playlist labels) is produced by the batch
        # generator but intentionally not passed to partial_fit.
        lda.partial_fit(X)
        dump(lda, os.path.join('trained_models',
                               f'{str(i).zfill(width)}.joblib'))
        # Remove the previous checkpoint. On the very first iteration no
        # predecessor exists — the original code raised FileNotFoundError
        # here; suppress makes the cleanup best-effort.
        with suppress(FileNotFoundError):
            os.remove(os.path.join('trained_models',
                                   f'{str(i - 1).zfill(width)}.joblib'))