def popularity(data, pidtotracks=None, name="DefaultName"):
    # Return the cached result if this ranking has already been computed and pickled.
    if pickle_utility.exists_file(name):
        return pickle_utility.load(name)
    if pidtotracks is None:
        pidtotracks = pickle_utility.load("pidtotracks")
    occurences = pickle_utility.load("trackOccurences")
    pop_dict = {}
    counter = 0
    print("Start popularity ranking...")
    for key, pid_predictions in data.items():
        track_pop_dict = Counter()
        # Collect every track of the predicted playlists together with its global occurrence count.
        for pid in pid_predictions:
            tracks = pidtotracks[int(pid)]
            for track in tracks.split():
                track_pop_dict[track] = occurences[track]
        # Keep the 1000 most popular tracks for this playlist.
        pop_dict[key] = [i[0] for i in track_pop_dict.most_common(1000)]
        counter += 1
        if counter % 100 == 0:
            print("Processed {} playlists".format(counter))
    pickle_utility.dump(pop_dict, name)
    return pop_dict
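# Hedged usage sketch (not part of the original module): `data` is assumed to map a
# challenge-playlist key to a list of similar pids, matching how the loop above indexes
# pidtotracks. The example key, pids and cache name below are illustrative only.
# example_data = {"1000002": ["17", "4213", "90001"]}
# ranked = popularity(example_data, name="popularityRanking_example")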
def getPopularityRankedPlaylists(playlists):
    print("Reading trackOccurences")
    trackOccCount = pickle_utility.load("trackOccurences")
    rankedSimilarTrackDict = dict()
    pidToPopDict = dict()
    count = 0
    for pid, similarPids in playlists.items():
        count += 1
        print(count)
        # Count the popularity of the similar playlists for this pid.
        for similarPid in similarPids:
            popularityCounter = 0
            tracks = pid_dict[int(float(similarPid))].split(" ")
            for track in tracks:
                popularityCounter += trackOccCount[track]
            pidToPopDict[similarPid] = popularityCounter
        # Rearrange the similar playlists for this pid so that the most popular come first.
        sortedSimilarPlaylists = dict(
            sorted(pidToPopDict.items(), key=operator.itemgetter(1), reverse=True))
        rankedSimilarTrackDict[pid] = list(sortedSimilarPlaylists.keys())
        pidToPopDict.clear()
        # Debug limiter: only process the first 100 playlists.
        if count == 100:
            break
    return rankedSimilarTrackDict
def getPopularityRankedTracks(playlists, name):
    trackPopularityDict = pickle_utility.load("trackOccurences")
    rankedTrackDict = dict()
    rankedSimilarTrackDict = dict()
    counter = 0
    for pid, similarPids in playlists.items():
        print(counter, " / ", len(playlists))
        counter += 1
        # Collect every track of the similar playlists together with its global occurrence count.
        for similarPid in similarPids:
            tracks = pid_dict[int(float(similarPid))]
            tracks = tracks.split(" ")
            for track in tracks:
                if track not in rankedTrackDict:
                    rankedTrackDict[track] = trackPopularityDict[track]
        # Rearrange the collected tracks for this pid so that the most popular come first.
        sortedSimilarTracks = dict(
            sorted(rankedTrackDict.items(), key=operator.itemgetter(1), reverse=True))
        rankedSimilarTrackDict[pid] = list(sortedSimilarTracks.keys())
        # Reset the per-pid collection so tracks do not leak into the next playlist's ranking.
        rankedTrackDict.clear()
    pickle_utility.dump(rankedSimilarTrackDict, "rankedTracks", name)
    return rankedSimilarTrackDict
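# Hedged usage sketch (illustrative only): `playlists` is assumed to map a challenge pid
# to a list of similar pids, as produced by the clustering step; the pid key and the
# output name below are made-up examples.
# ranked_tracks = getPopularityRankedTracks({"1000002": ["17", "4213"]}, "exampleRun")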
def popularity_cluster(data, caching=True):
    predictions = data.items()
    pidtotracks = pickle_utility.load("pidtotracks")
    occurences = pickle_utility.load("trackOccurences")
    popularity_dict = defaultdict(int)
    # p = Pool()
    counter = 0
    addProcesses = []
    pidProcesses = []
    caching_predictions = {}
    unique_tracks = set()
    mapping_dict = {}
    # First pass: collect the raw track strings of every predicted playlist per key.
    for key, prediction in predictions:
        prediction_set = set()
        mapping_dict[key] = prediction[0]
        for pid in prediction:
            tracks = pidtotracks[pid]
            prediction_set.add(tracks)
        # Assumed intended step: store the collected strings per key, otherwise the
        # second pass below iterates an empty caching_predictions dict.
        caching_predictions[key] = prediction_set
    print("Number of cached strings: {0}".format(len(caching_predictions.values())))
    # Second pass: split each track string only once; strings seen before are skipped.
    processed_track_strings = set()
    for key, prediction in caching_predictions.items():
        track_set = set()
        for track_string in prediction:
            if track_string not in processed_track_strings:
                for track in track_string.split():
                    track_set.add(track)
                processed_track_strings.add(track_string)
            else:
                print("Caching worked")
        caching_predictions[key] = track_set
    return popularity_dict
def getPopularityRankedPlaylists(playlists, name):
    pidPopularityDict = pickle_utility.load("pidPopularityDict")
    rankedPidDict = dict()
    rankedSimilarTrackDict = dict()
    for pid, similarPids in playlists.items():
        # Look up the precomputed popularity of every similar playlist.
        for similarPid in similarPids:
            rankedPidDict[similarPid] = pidPopularityDict[int(float(similarPid))]
        # Rearrange the similar playlists for this pid so that the most popular come first.
        sortedSimilarPlaylists = dict(
            sorted(rankedPidDict.items(), key=operator.itemgetter(1), reverse=True))
        rankedSimilarTrackDict[pid] = list(sortedSimilarPlaylists.keys())
        rankedPidDict.clear()
    pickle_utility.dump(rankedSimilarTrackDict, "rankedPlaylists", name)
    return rankedSimilarTrackDict
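# Hedged usage sketch (illustrative only): this variant reads the precomputed
# "pidPopularityDict" pickle instead of re-summing track occurrences per playlist.
# The pid key and output name below are made-up examples.
# ranked_playlists = getPopularityRankedPlaylists({"1000002": ["17", "4213"]}, "exampleRun")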
# another try for name clustering:
# playlist names are not split into single words,
# only special characters are removed and identical names are clustered
import app_settings
import pickle_utility
import json
import readData
import re
from nltk.stem import WordNetLemmatizer

wordDict = pickle_utility.load("wordDict_namesNotSplitted")
lookupDict = pickle_utility.load("lookupDict")
wl = WordNetLemmatizer()


def create_wordDict():
    words = dict()
    for k in lookupDict:
        # Each lookup key has the form "<name words> <pid>"; the last token is the playlist id.
        playlist = k.split(" ")
        name = []
        out = ""
        for j in range(len(playlist) - 1):
            name.append(playlist[j])
        id = playlist[-1]
        # Strip special characters, lowercase and concatenate the name words into one key.
        for i in range(len(name)):
            tmp = name[i]
            tmp = re.sub(r"[^\w\s_]", "", tmp)
            tmp = tmp.lower()
            out = out + tmp
        # Assumed completion of the truncated snippet: group playlist ids that share
        # the same cleaned name under one cluster key.
        if out not in words:
            words[out] = [id]
        else:
            words[out].append(id)
    return words
# another try for name clustering:
# playlist names are not split into single words,
# only special characters are removed and identical names are clustered
import pickle_utility
import app_settings
import json
import readData
import re
from nltk.stem import WordNetLemmatizer

wordDict = pickle_utility.load("wordDict_withoutIrrelevant")
# yearsDict = pickle_utility.load("yearsDict")
lookupDict = pickle_utility.load("lookupDict")
wl = WordNetLemmatizer()
# Generic words that carry no information for clustering.
irrelevant = ["the", "playlist", "favorite", "favourite", "best", "top", "tracks", "mix", "my"]
# years = []


def create_wordDict():
    words = dict()
    for k in lookupDict:
        # Split the lookup key on spaces, underscores and hyphens; the last token is the playlist id.
        playlist = re.split(' |_|-', k)
        name = []
        out = ""
        for j in range(len(playlist) - 1):
            name.append(playlist[j])
        id = playlist[-1]
        # Strip special characters and lowercase each name word.
        for i in range(len(name)):
            tmp = name[i]
            tmp = re.sub(r"[^\w\s_]", "", tmp)
            tmp = tmp.lower()
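# Illustrative example of the cleaning above (assumed lookup-key format "<name words> <pid>",
# the concrete key below is made up):
#   k = "My Road-Trip! Mix 123456"
#   re.split(' |_|-', k)              -> ['My', 'Road', 'Trip!', 'Mix', '123456']
#   re.sub(r"[^\w\s_]", "", 'Trip!')  -> 'Trip', lowercased -> 'trip'
# so playlists whose cleaned names match are meant to land in the same cluster.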
# import module_markov_500
import module_cluster
import copy
import itertools
import json
import operator
from collections import Counter
import app_settings
import pickle_utility

# read the challenge set
data = json.load(open(app_settings.CHALLENGE_SET))
playlists = data["playlists"]
originalTracks = dict()

# use pidtotracks for selecting actual tracks that should be in the submit file
print("Reading pidtotracks")
pid_dict = pickle_utility.load("pidtotracks")


def getPopularityRankedPlaylists(playlists):
    print("Reading trackOccurences")
    trackOccCount = pickle_utility.load("trackOccurences")
    rankedSimilarTrackDict = dict()
    pidToPopDict = dict()
    count = 0
    for pid, similarPids in playlists.items():
        count += 1
        print(count)
        # count popularity of similar playlists for this pid
        for similarPid in similarPids:
            popularityCounter = 0