def read_challenge_json(path):
    """Read a challenge file and cache per-playlist metadata.

    Populates the module-level ``challenge_metadata`` dict, mapping each
    playlist id to a tuple of (num_samples, name, holdout track uris,
    seed track uris, seed album uris, seed artist uris).
    """
    challenges = util.read_dataset_json(path)
    for challenge in challenges:
        pid = challenge["pid"]
        num_samples = challenge["num_samples"]
        # "name" is optional in the challenge format; default to empty string.
        name = challenge.get("name", "").strip()
        holdouts = [holdout["track_uri"] for holdout in challenge["holdouts"]]
        tracks = [t["track_uri"] for t in challenge["tracks"]]
        albums = [t["album_uri"] for t in challenge["tracks"]]
        artists = [t["artist_uri"] for t in challenge["tracks"]]
        challenge_metadata[pid] = (num_samples, name, holdouts, tracks,
                                   albums, artists)
    print("Challenge file is read: %s" % path)
def build(mpd_path, output_path, k):
    """Split the MPD into *k* challenge folds for cross-validation.

    Shuffles the slice files and the playlists inside each file, masks every
    playlist according to a randomly drawn category, and writes
    ``fold-00i.json`` files of ``MPD_SIZE // k`` playlists each into
    *output_path*.

    NOTE(review): playlists left over when MPD_SIZE is not divisible by k
    are never flushed to disk — confirm this silent drop is intended.
    """
    count, current_k = 0, 1
    # Integer floor division: avoids the float round-trip of int(MPD_SIZE / k),
    # which can lose precision for very large sizes.
    partition_size = MPD_SIZE // k
    files = listdir(mpd_path)
    random.shuffle(files)
    challenges = dict(playlists=[])
    for file in files:
        print("Processing %s" % file)
        items = util.read_dataset_json(join(mpd_path, file))
        random.shuffle(items)
        for item in items:
            count += 1
            category = util.random_category()
            playlist_json = dict(pid=item["pid"], name=item["name"],
                                 category=category["id"])
            # Some categories sample random tracks rather than a prefix.
            if category["shuffle"]:
                random.shuffle(item["tracks"])
            num_tracks = len(item["tracks"])
            num_samples = int(num_tracks * category["fraction"])
            playlist_json["num_tracks"] = num_tracks
            playlist_json["num_samples"] = num_samples
            playlist_json["num_holdouts"] = num_tracks - num_samples
            # First num_samples tracks are the visible seeds, the rest are
            # held out; both sides are sorted back into playlist order.
            playlist_json["tracks"] = sorted(item["tracks"][:num_samples],
                                             key=lambda x: x["pos"])
            playlist_json["holdouts"] = sorted(item["tracks"][num_samples:],
                                               key=lambda x: x["pos"])
            challenges["playlists"].append(playlist_json)
            if count == partition_size:
                fold_path = join(output_path,
                                 "fold-{0:03d}.json".format(current_k))
                with open(fold_path, "w") as f:
                    json.dump(challenges, f, indent=4)
                print("Fold %d is created with %d playlists" %
                      (current_k, len(challenges["playlists"])))
                count = 0
                current_k += 1
                del challenges["playlists"][:]
    print("%d-fold cv files are created in folder: %s" % (k, output_path))
def read_challenge_json(path):
    """Read a challenge fold and cache its category and holdout tracks.

    Populates the module-level ``challenges`` dict, mapping each playlist id
    to its integer category id and the list of held-out track uris.
    """
    playlists = util.read_dataset_json(path)
    for playlist in playlists:
        pid = playlist["pid"]
        category = int(playlist["category"])
        holdouts = [h["track_uri"] for h in playlist["holdouts"]]
        challenges[pid] = dict(category=category, holdouts=holdouts)
    print("\nChallenge file is read: %s" % path)
def process_dataset_json(path):
    """Accumulate track/album/artist metadata from one MPD slice file.

    Updates the module-level ``track_metadata``, ``album_metadata`` and
    ``artist_metadata`` dicts with occurrence counts, containment sets
    (tracks per album, albums/tracks per artist) and the ids of the
    playlists each entity appears in.
    """
    playlists = util.read_dataset_json(path)
    for playlist in playlists:
        pid = playlist["pid"]
        for track in playlist["tracks"]:
            track_uri = track["track_uri"]
            album_uri = track["album_uri"]
            artist_uri = track["artist_uri"]
            if track_uri not in track_metadata:
                # First sighting: record static fields with count 1.
                track_metadata[track_uri] = dict(
                    track_name=track["track_name"],
                    album_uri=album_uri,
                    artist_uri=artist_uri,
                    duration=track["duration_ms"],
                    occurrence=1,
                    pids={pid})
            else:
                track_metadata[track_uri]["occurrence"] += 1
                track_metadata[track_uri]["pids"].add(pid)
            if album_uri not in album_metadata:
                album_metadata[album_uri] = dict(
                    album_name=track["album_name"],
                    tracks={track_uri},
                    occurrence=1,
                    pids={pid})
            else:
                album_metadata[album_uri]["occurrence"] += 1
                album_metadata[album_uri]["tracks"].add(track_uri)
                album_metadata[album_uri]["pids"].add(pid)
            if artist_uri not in artist_metadata:
                artist_metadata[artist_uri] = dict(
                    artist_name=track["artist_name"],
                    albums={album_uri},
                    tracks={track_uri},
                    occurrence=1,
                    pids={pid})
            else:
                artist_metadata[artist_uri]["occurrence"] += 1
                artist_metadata[artist_uri]["albums"].add(album_uri)
                artist_metadata[artist_uri]["tracks"].add(track_uri)
                artist_metadata[artist_uri]["pids"].add(pid)
def summarize(path, instance, verbose):
    """Print per-category and overall statistics for a challenge file.

    For each category collects instance counts plus track/sample/holdout
    list sizes, then tabulates category averages and an overall row.
    With ``verbose == 1``, also prints one line per playlist.
    """
    print("\nFile: %s\n" % path)
    stats, summary = {}, []
    for playlist in util.read_dataset_json(path):
        cid = playlist["category"]
        c = util.search_category(instance, cid)
        entry = stats.setdefault(cid, dict(instances=0, num_tracks=[],
                                           num_samples=[], num_holdouts=[]))
        entry["instances"] += 1
        for field in ("num_tracks", "num_samples", "num_holdouts"):
            entry[field].append(playlist[field])
        if verbose == 1:
            print(" ".join([str(playlist["pid"]), str(cid), c["display"]]))
    total, all_tracks, all_samples, all_holdouts = 0, [], [], []
    for cid, v in sorted(stats.items()):
        summary.append([cid, v["instances"],
                        statistics.mean(v["num_tracks"]),
                        statistics.mean(v["num_samples"]),
                        statistics.mean(v["num_holdouts"])])
        total += v["instances"]
        all_tracks.extend(v["num_tracks"])
        all_samples.extend(v["num_samples"])
        all_holdouts.extend(v["num_holdouts"])
    summary.append(["overall", total,
                    statistics.mean(all_tracks),
                    statistics.mean(all_samples),
                    statistics.mean(all_holdouts)])
    print(tabulate(summary, headers=["category", "instances", "avg tracks",
                                     "avg samples", "avg holdouts"]))
def build(mpd_path, output_path, size, instance):
    """Fill the train/test/validation splits with *size* playlists each.

    Walks shuffled MPD slice files, masks each playlist with the
    *instance*-specific strategy ("recsys" or custom), and stops as soon as
    every split in the module-level ``ttv`` dict holds *size* playlists,
    then dumps the result to *output_path*.
    """
    required = len(ttv) * size

    def quota_reached():
        # True once the splits together hold the required playlist count.
        return sum(len(v) for v in ttv.values()) == required

    files = listdir(mpd_path)
    random.shuffle(files)
    for file in files:
        if quota_reached():
            break
        print("Processing %s" % file)
        items = util.read_dataset_json(join(mpd_path, file))
        random.shuffle(items)
        for item in items:
            if instance == "recsys":
                mask_recsys(item, size)
            else:
                mask_custom(item, size)
            if quota_reached():
                break
    dump(output_path)