# Presumably the minimum number of songs an hour slot must contain to be
# processed; its use is not visible in this part of the file -- TODO confirm.
min_songs_hour = 10

# Folder path for per-hour listening-history output (inferred from the name;
# the code that writes to it is not visible here -- verify against callers).
output_listeningHistoryFolder = "../data/process/listeningHistory-hours/"

def doExperiment(lisHistFile, songsetFile):
  """Debug helper: print the first listening-history row, then stop the program.

  ``lisHistFile`` is read as a semicolon-delimited, UTF-8 encoded CSV file.
  Its first row is printed and the interpreter is then terminated through
  ``sys.exit()``, so this function never returns normally.
  ``songsetFile`` is accepted but not used by the current body.
  """
  listening_history = pd.read_csv(lisHistFile, encoding="UTF-8", delimiter=";")
  first_row = listening_history.head(1)
  print(first_row)
  sys.exit()
  

# Process every CSV file found in listeningHistoryFolder (defined outside this
# fragment), in sorted path order.
contents_lh = glob(f"{listeningHistoryFolder}*.csv")
contents_lh.sort()
for listeningHistoryFile in contents_lh:
  #df = pd.read_csv(listeningHistoryFile,delimiter=";", encoding="UTF-8")
  #print(df.head())
  # Load the file through the project helper (tab-delimited) and order the rows
  # by date, time and played duration.
  df = mylib.loadData(listeningHistoryFile, min_msPlayed, delimiter="\t")
  df.sort_values(by=["date","time","msPlayed"], inplace=True)
  
  # Add columns that will hold the corrected play time; both start out filled
  # with a single-space placeholder string.
  #df["MY_PLAYED_AT"] = [pd.Timestamp('2017-01-01T12') for i in range(0,df.shape[0])]
  df["played_at"] = [" " for i in range(0,df.shape[0])]
  df["datetime"] = [" " for i in range(0,df.shape[0])]
  for index, row in df.iterrows():
    # Move to the end of the minute (+59 s) and subtract the played duration.
    # NOTE(review): this presumably assumes "endTime" has minute resolution -- confirm.
    ##df.at[index, "MY_PLAYED_AT"] = row["endTime"] + pd.Timedelta(seconds=(59-row["msPlayed"]//1000))
    #df.at[index, "MY_PLAYED_AT"] = row["endTime"] + pd.Timedelta(seconds=59) - pd.Timedelta(milliseconds=row["msPlayed"])
    played_at_timestamp = row["endTime"] + pd.Timedelta(seconds=59) - pd.Timedelta(milliseconds=row["msPlayed"])
    
    # Problem: isoformat() does not print seconds and milliseconds when they are all 0.
    #timeiso = played_at_timestamp.isoformat()
    #df.at[index, "played_at"] = timeiso[:-3] + "Z"
示例#2
0
# In[] load 1M data
import mylib


# Load the dataset; the [data_] pattern unpacks a one-element result from
# mylib.loadData.
[data_] = mylib.loadData('../1Mtrain')
# The first 900,000 entries form the training split, the remainder the
# validation split.
train_data = data_[:900000]
valid_data = data_[900000:]
# In[] calculate with a lamda
from collections import defaultdict
from math import exp
import numpy

def getRui(data):
    """Map every (reviewerID, itemID) pair found in ``data`` to its rating.

    ``data`` is an iterable of mappings providing the keys 'reviewerID',
    'itemID' and 'rating'. If a pair appears more than once, the rating of
    its last occurrence is kept.
    """
    return {
        (record['reviewerID'], record['itemID']): record['rating']
        for record in data
    }

def getIu_Ui(data):
    """Group the records of ``data`` by reviewer and by item.

    ``data`` is an iterable of mappings providing the keys 'reviewerID' and
    'itemID'. Returns a two-element list ``[Iu, Ui]`` of plain dicts:
    ``Iu`` maps each reviewer ID to the list of item IDs in its records and
    ``Ui`` maps each item ID to the list of reviewer IDs in its records.
    Lists follow the order of ``data`` and keep repeated entries.
    """
    items_by_user = {}
    users_by_item = {}
    for record in data:
        user, item = record['reviewerID'], record['itemID']
        items_by_user.setdefault(user, []).append(item)
        users_by_item.setdefault(item, []).append(user)
    return [items_by_user, users_by_item]

# Build the rating lookup and the per-reviewer / per-item groupings from the
# training split only.
Rui = getRui(train_data)
[Iu, Ui] = getIu_Ui(train_data)
示例#3
0
def generateSongset(csv_file,
                    output_folder,
                    cluster_method="KM",
                    heuristic_method="LINEAR",
                    min_songs_hour=10,
                    min_ms_played=10000,
                    max_clusters=10,
                    num_tracks=100):
    """Generate and save one recommended songset per processed hour of the day.

    The listening history in ``csv_file`` is loaded with ``mylib.loadData``
    (tab-delimited). For each processed hour the songs of that hour are
    selected, de-duplicated by "TrackID", clustered on their audio features,
    and seed points chosen by the selected heuristic are sent to
    ``recommenderGetSongs``. The collected tracks are stored with
    ``saveSongset``.

    Args:
        csv_file: Path of the listening-history file. Its name must contain
            ".csv" or ".tsv"; otherwise an error is printed and the file is
            skipped.
        output_folder: Destination folder, forwarded to ``saveSongset``.
        cluster_method: Clustering method, case-insensitive. Only "KM" is
            implemented; "FBF" and any other value terminate the program.
        heuristic_method: Seed-selection heuristic, case-insensitive:
            "LINEAR" or "SPHERE". Any other value terminates the program.
        min_songs_hour: Hours with fewer distinct tracks than this are skipped.
        min_ms_played: Forwarded to ``mylib.loadData``.
        max_clusters: Forwarded to ``best_k_means``.
        num_tracks: Used to size the number of recommendations requested per
            seed point: ``int(num_tracks / (4 * kLength)) + 1``.

    Returns:
        None.

    Raises:
        SystemExit: If the cluster method or the heuristic is not supported
            (raised through ``sys.exit()`` once an hour reaches that step).
    """
    print(csv_file)
    if ".csv" not in csv_file and ".tsv" not in csv_file:
        print("ERROR: only tsv and csv input files are allowed. Skip file.")
        # Actually skip the file, as the message says; previously execution
        # fell through and tried to load the unsupported file anyway.
        return

    cluster_method = cluster_method.upper()
    heuristic_method = heuristic_method.upper()

    df = mylib.loadData(csv_file, min_ms_played, delimiter="\t")

    # 3.1
    # NOTE(review): the result is not used further down in this function; the
    # call is kept in case computeNTNA_NTKA has side effects -- confirm.
    ntna_ntka = computeNTNA_NTKA(df)

    for time_hour in range(0, 24):
        # NOTE(review): only hours 21, 22 and 23 are processed. This looks
        # like a leftover debugging filter -- confirm before removing.
        if time_hour < 21:
            continue

        # 3.2 - FILTERING
        df_h = songs_byHour(df, time_hour)

        # Remove duplicate songs
        df_h.drop_duplicates(subset="TrackID", keep="first", inplace=True)

        # Check the number of songs in this hour slot
        if df_h.shape[0] < min_songs_hour:
            print(f"* hour {time_hour}: skip ({df_h.shape[0]} songs).")
            continue
        print(f"* hour {time_hour}: {df_h.shape[0]} songs.")

        # 3.3 - CLUSTERING
        df_h_feat = df_h[[
            "Acousticness", "Danceability", "Energy", "Instrumentalness",
            "Key", "Liveness", "Loudeness", "Mode", "Speechiness", "Tempo",
            "Time_signature", "Valence"
        ]]

        if cluster_method == "KM":
            best_clustering = best_k_means(
                df_h_feat, max_clusters,
                "exclude_K_less_4_songs")  #"exclude_cluster_less_4_songs")
        elif cluster_method == "FBF":
            print(
                f"ERROR in generateSongset(): {cluster_method} not yet implemented. Exit."
            )
            sys.exit()
        else:
            print(
                f"ERROR in generateSongset(): cluster method {cluster_method} not defined. Exit."
            )
            sys.exit()

        kLength = best_clustering["best-length"]
        # Four seed points are queried per cluster, hence the factor 4.
        numReqs_perPoint = int(num_tracks / (4 * kLength)) + 1
        # NOTE(review): this order differs from the column order used for
        # clustering above. If centroid / random-point vectors follow the
        # clustering column order, the index-based mapping below assigns
        # values to the wrong feature names -- verify against the heuristics.
        feature_names = [
            "Acousticness", "Danceability", "Energy", "Speechiness",
            "Instrumentalness", "Liveness", "Valence", "Loudeness", "Tempo",
            "Time_signature", "Key", "Mode"
        ]

        ########################
        ### LINEAR HEURISTIC ###
        ########################
        if heuristic_method == "LINEAR":
            linear_kMeans = linearHeuristic(df_h, best_clustering)
            ###########################
            ### RECOMMENDER SPOTIFY ###
            ###########################
            results = []
            for kIndex, df_group in enumerate(linear_kMeans):
                # FIRST SONG: seeded with the centroid features of the cluster
                print(
                    f"\t{time_hour}) CLUSTER KM #{kIndex}/{kLength} - SONG #0/4"
                )
                firstPoint = df_group.iloc[0]
                trackId = firstPoint["TrackID"]
                # get features
                centroidFeatures_list = firstPoint["kCentroid"]
                features = dict()
                for index in range(0, len(centroidFeatures_list)):
                    features[
                        feature_names[index]] = centroidFeatures_list[index]
                tracks = recommenderGetSongs(trackId,
                                             features,
                                             numReqs_perPoint,
                                             results,
                                             retryLimit=2,
                                             sleepTime=recommender_sleepTime)
                results.extend(tracks)
                # OTHER THREE SONGS
                # NOTE(review): here the whole row is passed where the first
                # song passes a features dict, and each group is assumed to
                # hold at least four rows -- confirm both with the helpers.
                for i in range(1, 4):
                    print(
                        f"\t{time_hour}) CLUSTER KM LINEAR #{kIndex}/{kLength} - SONG #{i}/4"
                    )
                    point = df_group.iloc[i]
                    trackId = point["TrackID"]
                    tracks = recommenderGetSongs(
                        trackId,
                        point,
                        numReqs_perPoint,
                        results,
                        retryLimit=2,
                        sleepTime=recommender_sleepTime)
                    results.extend(tracks)

        ########################
        ### SPHERE HEURISTIC ###
        ########################
        elif heuristic_method == "SPHERE":
            sphere_kMeans = sphereHeuristic(df_h, best_clustering)

            ###########################
            ### RECOMMENDER SPOTIFY ###
            ###########################
            results = []
            for item in sphere_kMeans:
                print(
                    f"\t{time_hour}) {kLength} CLUSTER KM SPHERE - SONG #{item['index']}/{len(sphere_kMeans)}"
                )
                # item keys used here: "index", "randomPoint", "minDistSong"
                randomPoint = item["randomPoint"]
                features = dict()
                for index in range(0, len(randomPoint)):
                    features[feature_names[index]] = randomPoint[index]
                clusterMinDistSong = item["minDistSong"]
                trackId = clusterMinDistSong["TrackID"]
                tracks = recommenderGetSongs(trackId,
                                             features,
                                             numReqs_perPoint,
                                             results,
                                             retryLimit=2,
                                             sleepTime=recommender_sleepTime)
                results.extend(tracks)
        else:
            # Report the heuristic name (the message used to print the
            # cluster method here by mistake).
            print(
                f"ERROR in generateSongset(): heuristic {heuristic_method} not defined. Exit."
            )
            sys.exit()

        # Output file name starts with the input base name without extension.
        output_file_start = ntpath.basename(csv_file).replace(".csv",
                                                              "").replace(
                                                                  ".tsv", "")
        saveSongset(results, output_folder, output_file_start, time_hour,
                    kLength, cluster_method, heuristic_method)
            pairs_Rating.append(UIO)
    rating_test_predict = []
    for pr in pairs_Rating:
        if pr[0] in dirty_u.keys():
            rating_test_predict.append(dirty_u[pr[0]])
        elif pr[1] in dirty_i.keys():
            rating_test_predict.append(dirty_i[pr[1]])
        else:
            rating_test_predict.append(alpha+beta_u[pr[0]]+beta_i[pr[1]])
    rating_test_result = [[pr[0]+'-'+pr[1], str(ptr)] for pr,ptr in zip(pairs_Rating,rating_test_predict)]
    
    saveCSV('rating_test_result.csv',rating_test_result)
    
# In[]================================================================================
# In[] main
# Load the dataset; the [data_] pattern unpacks a one-element result from
# mylib.loadData.
[data_] = mylib.loadData('./assignment1/data/1Mtrain')
# The first 900,000 entries form the training split, the remainder the
# validation split; the full list is dropped afterwards.
train_data = data_[:900000]
valid_data = data_[900000:]
del data_

# Parameter grid: every combination of dirty limit, sd value and dirty bound
# pair is visited by the nested loops below.
dirty_limits = [10,11,12,13,14,15,16,17,18,19,20]
sds = [0.3,0.5,0.7,0.9,1.0,2.0]
dirty_bound = [[2.6,4.4],[2.8,4.2],[3,4],[3.5,3.5]]
# Result tables indexed as [dirty-limit index][sd index][dirty-bound index],
# initialised to 0. Presumably filled with the MSE and the fitted parameters
# of each combination -- the code doing so is not visible here.
MSEs = [[[0 for dbi in range(len(dirty_bound))] for si in range(len(sds))] for dli in range(len(dirty_limits))]
thetas = [[[0 for dbi in range(len(dirty_bound))] for si in range(len(sds))] for dli in range(len(dirty_limits))]
for dli in range(len(dirty_limits)):
    dl = dirty_limits[dli]
    for si in range(len(sds)):
        s = sds[si]
        for dbi in range(len(dirty_bound)):
            db = dirty_bound[dbi]
示例#5
0
"""
Created on Thu Nov 12 22:56:19 2015

@author: ssc317
"""

import mylib
import csv
filename = '1M_train_rating'

# Read the pairs to predict: every line starting with 'U' is split on '-'
# into its parts. Other lines (such as a 'userID...' header, which does not
# start with an upper-case 'U') are ignored. The file is opened in a `with`
# block so that the handle is always closed.
with open('./pairs_Rating.txt') as f:
    pairs_Rating = [line.split('-') for line in f if line.startswith('U')]

# Model parameters loaded through the project helper.
[alpha, beta_u, beta_i] = mylib.loadData(filename)

# Prediction for each pair: alpha + beta_u[first part] + beta_i[second part].
# NOTE(review): pr[1] still carries the trailing newline of its line when it
# is used as the beta_i key (it is only stripped for the output row below) --
# confirm that this lookup is intended.
rating_test_predict = [alpha+beta_u[pr[0]]+beta_i[pr[1]] for pr in pairs_Rating]
# Output rows: ['<first part>-<second part>', '<prediction as string>'].
rating_test_result = [[pr[0]+'-'+pr[1].rstrip(), str(ptr)] for pr,ptr in zip(pairs_Rating,rating_test_predict)]


def saveCSV(filename, data):
    """Write prediction rows to ``filename`` as a CSV file.

    The file starts with the header row ``userID-itemID,prediction``,
    followed by one row per element of ``data``.

    Args:
        filename: Path of the CSV file to create or overwrite.
        data: Iterable of rows, each an iterable of fields.
    """
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3; the former 'wb' mode makes writerow() raise TypeError there.
    # The `with` block also closes the file if writing fails.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['userID-itemID', 'prediction'])
        writer.writerows(data)
saveCSV('rating_test_result.csv',rating_test_result)