Example #1
0
def popularity(data, pidtotracks=None, name="DefaultName"):
    """Rank tracks by global popularity for each entry's predicted playlists.

    For every key in *data* (key -> list of similar pids), collect the tracks
    of those pids and keep the 1000 tracks with the highest global occurrence
    count.  Results are cached via pickle_utility under *name*.

    Args:
        data: mapping of key -> iterable of playlist ids (pids).
        pidtotracks: optional mapping pid -> whitespace-separated track string;
            loaded from the "pidtotracks" pickle when not supplied.
        name: cache name; if a pickle with this name exists it is returned
            immediately without recomputation.

    Returns:
        dict mapping each key of *data* to a list of up to 1000 track ids,
        ordered by descending occurrence count.
    """
    # Return the cached result if it was computed before.
    if pickle_utility.exists_file(name):
        return pickle_utility.load(name)
    if pidtotracks is None:  # "is None", not "== None" (PEP 8)
        pidtotracks = pickle_utility.load("pidtotracks")

    occurences = pickle_utility.load("trackOccurences")

    pop_dict = {}
    counter = 0

    print("Start popularity ranking...")
    for key, pid_predictions in data.items():
        # Counter lets us take the top-1000 by count via most_common().
        track_pop_dict = Counter()
        for pid in pid_predictions:
            tracks = pidtotracks[int(pid)]
            for track in tracks.split():
                track_pop_dict[track] = occurences[track]
        pop_dict[key] = [i[0] for i in track_pop_dict.most_common(1000)]
        counter += 1
        if counter % 100 == 0:
            print("Processed {} playlists".format(counter))

    pickle_utility.dump(pop_dict, name)
    return pop_dict
Example #2
0
def getPopularityRankedPlaylists(playlists):
    """Reorder each pid's similar playlists so the most popular come first.

    The popularity of a playlist is the sum of the global occurrence counts
    of its tracks.  Only the first 100 entries of *playlists* are processed.

    Args:
        playlists: mapping pid -> iterable of similar pids.

    Returns:
        dict mapping pid -> list of its similar pids, most popular first.
    """
    print("Reading trackOccurences")
    trackOccCount = pickle_utility.load("trackOccurences")

    rankedSimilarTrackDict = dict()
    pidToPopDict = dict()
    count = 0
    for pid, similarPids in playlists.items():
        count += 1
        print(count)
        # Score every similar playlist by the summed popularity of its tracks.
        for similarPid in similarPids:
            trackIds = pid_dict[int(float(similarPid))].split(" ")
            pidToPopDict[similarPid] = sum(trackOccCount[t] for t in trackIds)

        # Keep only the pids, ordered by descending popularity score.
        orderedPairs = sorted(pidToPopDict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        rankedSimilarTrackDict[pid] = [sp for sp, _ in orderedPairs]
        pidToPopDict.clear()
        if count == 100:
            break
    return rankedSimilarTrackDict
Example #3
0
def getPopularityRankedTracks(playlists, name):
    """For each pid, rank the tracks of its similar playlists by popularity.

    Args:
        playlists: mapping pid -> iterable of similar pids.
        name: suffix handed to pickle_utility.dump for the cached result.

    Returns:
        dict mapping pid -> list of track ids, most popular first.
    """
    trackPopularityDict = pickle_utility.load("trackOccurences")

    rankedTrackDict = dict()
    rankedSimilarTrackDict = dict()
    counter = 0
    for pid, similarPids in playlists.items():
        print(counter, " / ", len(playlists))
        counter += 1

        # Collect every track of every similar playlist with its global count.
        for similarPid in similarPids:
            tracks = pid_dict[int(float(similarPid))].split(" ")
            for track in tracks:
                if track not in rankedTrackDict:
                    rankedTrackDict[track] = trackPopularityDict[track]

        # Rearrange the tracks for this pid so the most popular come first.
        sortedSimilarTracks = sorted(rankedTrackDict.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)
        rankedSimilarTrackDict[pid] = [track for track, _ in sortedSimilarTracks]

        # BUG FIX: reset the per-pid accumulator.  The original cleared
        # sortedSimilarTracks (a throwaway local built fresh each iteration),
        # so rankedTrackDict kept growing and every pid's ranking also
        # contained the tracks of all previously processed pids.  Examples
        # of the same pattern elsewhere (getPopularityRankedPlaylists) clear
        # the accumulator, as done here.
        rankedTrackDict.clear()

    pickle_utility.dump(rankedSimilarTrackDict, "rankedTracks", name)
    return rankedSimilarTrackDict
Example #4
0
def popularity_cluster(data, caching=True):
    """Popularity ranking over clustered predictions.

    NOTE(review): this function looks unfinished -- see the inline notes
    below.  As written it always returns an empty defaultdict.

    Args:
        data: mapping of key -> list of predicted pids -- presumably the
            output of a clustering step; verify against the caller.
        caching: unused in the visible body -- TODO confirm intent.

    Returns:
        defaultdict(int); currently never populated.
    """
    predictions = data.items()

    pidtotracks = pickle_utility.load("pidtotracks")

    occurences = pickle_utility.load("trackOccurences")

    popularity_dict = defaultdict(int)

    #p = Pool()
    counter = 0
    addProcesses = []
    pidProcesses = []

    caching_predictions = {}
    unique_tracks = set()

    mapping_dict = {}

    # NOTE(review): prediction_set is built but never stored anywhere (dead
    # code) -- presumably it was meant to be saved into caching_predictions.
    for key, prediction in predictions:
        prediction_set = set()
        mapping_dict[key] = prediction[0]
        for pid in prediction:
            tracks = pidtotracks[pid]
            prediction_set.add(tracks)

    # NOTE(review): caching_predictions is still empty at this point, so the
    # loop below never runs and popularity_dict is returned empty.
    print("Number of cached strings: {0}".format(
        len(caching_predictions.values())))
    processed_track_strings = set()
    for key, prediction in caching_predictions.items():
        track_set = set()
        for track_string in prediction:
            # Split each unseen track string once; duplicates are skipped.
            if (not track_string in processed_track_strings):
                for track in track_string.split():
                    track_set.add(track)
                processed_track_strings.add(track_string)
            else:
                print("Caching worked")
        caching_predictions[key] = track_set

    return popularity_dict
Example #5
0
def getPopularityRankedPlaylists(playlists, name):
    """Sort each pid's similar playlists by precomputed playlist popularity.

    Args:
        playlists: mapping pid -> iterable of similar pids.
        name: suffix handed to pickle_utility.dump for the cached result.

    Returns:
        dict mapping pid -> its similar pids ordered most-popular-first.
    """
    pidPopularityDict = pickle_utility.load("pidPopularityDict")

    rankedSimilarTrackDict = dict()
    for pid, similarPids in playlists.items():
        # Look up the popularity score of every similar playlist.
        scores = {sp: pidPopularityDict[int(float(sp))] for sp in similarPids}

        # Keep only the pids, most popular first.
        orderedPairs = sorted(scores.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        rankedSimilarTrackDict[pid] = [sp for sp, _ in orderedPairs]

    pickle_utility.dump(rankedSimilarTrackDict, "rankedPlaylists", name)
    return rankedSimilarTrackDict
Example #6
0
# another try for name clustering.
# playlist names are not splitted
# only remove special characters and cluster same playlists

import app_settings
import pickle_utility
import json
import readData
import re
from nltk.stem import WordNetLemmatizer

# Cached dictionaries produced by earlier pipeline stages.
wordDict = pickle_utility.load("wordDict_namesNotSplitted")
lookupDict = pickle_utility.load("lookupDict")
# BUG FIX: instantiate the lemmatizer.  The original bound the class itself
# (wl = WordNetLemmatizer), so wl.lemmatize(word) would pass `word` as `self`.
wl = WordNetLemmatizer()


def create_wordDict():
    words = dict()
    for k in lookupDict:
        playlist = k.split(" ")
        name = []
        out = ""
        for j in range(len(playlist) - 1):
            name.append(playlist[j])
        id = playlist[-1]
        for i in range(len(name)):
            tmp = name[i]
            tmp = re.sub("[^\w\s\_]", "", tmp)
            tmp = tmp.lower()
            out = out + tmp
        if out not in words:
Example #7
0
# another try for name clustering.
# playlist names are not splitted
# only remove special characters and cluster same playlists

import pickle_utility
import app_settings
import json
import readData
import re
from nltk.stem import WordNetLemmatizer

# Cached dictionaries produced by earlier pipeline stages.
wordDict = pickle_utility.load("wordDict_withoutIrrelevant")
#yearsDict = pickle_utility.load("yearsDict")
lookupDict = pickle_utility.load("lookupDict")
# BUG FIX: instantiate the lemmatizer.  The original bound the class itself
# (wl = WordNetLemmatizer), so wl.lemmatize(word) would pass `word` as `self`.
wl = WordNetLemmatizer()
# Filler words that carry no clustering signal and are stripped from names.
irrelevant = ["the", "playlist", "favorite", "favourite", "best", "top", "tracks", "mix", "my"]
# years = []

def create_wordDict():
    words = dict()
    for k in lookupDict:
        playlist = re.split(' |_|-', k)
        name = []
        out = ""
        for j in range(len(playlist)-1):
            name.append(playlist[j])
        id = playlist[-1]
        for i in range(len(name)):
            tmp = name[i]
            tmp = re.sub("[^\w\s\_]", "", tmp)
            tmp = tmp.lower()
Example #8
0
# import module_markov_500
import module_cluster
import copy
import itertools
from collections import Counter
import app_settings
import pickle_utility

# BUG FIX: json.load is used below but `json` is missing from the import
# block above, which would raise NameError at import time.
import json

# Read the challenge set to be answered; close the handle deterministically
# (the original used a bare open() and leaked the file object).
with open(app_settings.CHALLENGE_SET) as challenge_file:
    data = json.load(challenge_file)
playlists = data["playlists"]
originalTracks = dict()

# use pidtotracks for selecting actual tracks that should be in the submit file
print("Reading pidtotracks")
pid_dict = pickle_utility.load("pidtotracks")


def getPopularityRankedPlaylists(playlists):
    print("Reading trackOccurences")
    trackOccCount = pickle_utility.load("trackOccurences")

    rankedSimilarTrackDict = dict()
    pidToPopDict = dict()
    count = 0
    for pid, similarPids in playlists.items():
        count += 1
        print(count)
        # count popularity of similar playlists for this pid
        for similarPid in similarPids:
            popularityCounter = 0