Example #1
def read_challenge_json(path):
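    # Read a challenge-set JSON file and index every playlist by its pid in the
    # module-level challenge_metadata dict, storing the number of seed tracks,
    # the (optional) playlist name, the held-out track URIs, and the seed
    # track/album/artist URIs.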
    challenges = util.read_dataset_json(path)

    for challenge in challenges:
        pid = challenge["pid"]

        num_samples = challenge["num_samples"]

        if "name" in challenge:
            name = challenge["name"].strip()
        else:
            name = ""

        holdouts = [holdout["track_uri"] for holdout in challenge["holdouts"]]

        tracks, albums, artists = [], [], []
        for track in challenge["tracks"]:
            tracks.append(track["track_uri"])
            albums.append(track["album_uri"])
            artists.append(track["artist_uri"])

        challenge_metadata[pid] = (num_samples, name, holdouts, tracks, albums,
                                   artists)

    print("Challenge file is read: %s" % path)
Example #2
def build(mpd_path, output_path, k):
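    # Partition the randomly shuffled MPD into k folds of int(MPD_SIZE / k)
    # playlists each, turning every playlist into a challenge entry, and write
    # each fold to a fold-NNN.json file under output_path.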
    count, current_k, partition_size = 0, 1, int(MPD_SIZE / k)

    files = listdir(mpd_path)
    random.shuffle(files)

    challenges = dict(playlists=[])

    for file in files:
        print("Processing %s" % file)

        items = util.read_dataset_json(join(mpd_path, file))
        random.shuffle(items)

        for item in items:
            count += 1
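            # A randomly drawn category determines what fraction of the tracks
            # is revealed as seeds and whether the seeds are taken from the
            # start of the playlist or sampled at random (shuffled categories).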
            category = util.random_category()
            playlist_json = dict(pid=item["pid"],
                                 name=item["name"],
                                 category=category["id"])

            if category["shuffle"]:
                random.shuffle(item["tracks"])

            num_tracks = len(item["tracks"])
            num_samples = int(num_tracks * category["fraction"])

            playlist_json["num_tracks"] = num_tracks
            playlist_json["num_samples"] = num_samples
            playlist_json["num_holdouts"] = num_tracks - num_samples

            # Both the visible seed tracks and the holdouts are re-sorted by
            # their original position in the playlist.
            playlist_json["tracks"] = sorted(item["tracks"][:num_samples],
                                             key=lambda x: x["pos"])
            playlist_json["holdouts"] = sorted(item["tracks"][num_samples:],
                                               key=lambda x: x["pos"])

            challenges["playlists"].append(playlist_json)

            # Flush the current fold to disk once partition_size playlists have
            # been collected, then start the next fold.
            if count == partition_size:
                with open(
                        join(output_path,
                             "fold-{0:03d}.json".format(current_k)), "w") as f:
                    json.dump(challenges, f, indent=4)

                print("Fold %d is created with %d playlists" %
                      (current_k, len(challenges["playlists"])))

                count = 0
                current_k += 1

                del challenges["playlists"][:]

    print("%d-fold cv files are created in folder: %s" % (k, output_path))
Example #3
def read_challenge_json(path):
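    # Read a challenge-set JSON file and record, for every playlist id, its
    # category and the list of held-out track URIs in the module-level
    # challenges dict.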
    playlists = util.read_dataset_json(path)

    for playlist in playlists:
        pid = playlist["pid"]
        category = int(playlist["category"])
        holdouts = [h["track_uri"] for h in playlist["holdouts"]]

        challenges[pid] = dict(category=category, holdouts=holdouts)

    print("\nChallenge file is read: %s" % path)
Example #4
def process_dataset_json(path):
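    # Build three module-level lookup tables from a dataset slice:
    # track_metadata, album_metadata and artist_metadata. Each entry keeps the
    # display name, an occurrence count and the set of playlist ids (pids) the
    # item appears in; albums and artists also accumulate their member tracks.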
    playlists = util.read_dataset_json(path)

    for playlist in playlists:
        pid = playlist["pid"]

        for track in playlist["tracks"]:
            track_uri = track["track_uri"]
            album_uri = track["album_uri"]
            artist_uri = track["artist_uri"]

            if track_uri not in track_metadata:
                track_metadata[track_uri] = dict(
                    track_name=track["track_name"],
                    album_uri=album_uri,
                    artist_uri=artist_uri,
                    duration=track["duration_ms"],
                    occurrence=1,
                    pids={pid})
            else:
                track_metadata[track_uri]["occurrence"] += 1
                track_metadata[track_uri]["pids"].add(pid)

            if album_uri not in album_metadata:
                album_metadata[album_uri] = dict(
                    album_name=track["album_name"],
                    tracks={track_uri},
                    occurrence=1,
                    pids={pid})
            else:
                album_metadata[album_uri]["occurrence"] += 1
                album_metadata[album_uri]["tracks"].add(track_uri)
                album_metadata[album_uri]["pids"].add(pid)

            if artist_uri not in artist_metadata:
                artist_metadata[artist_uri] = dict(
                    artist_name=track["artist_name"],
                    albums={album_uri},
                    tracks={track_uri},
                    occurrence=1,
                    pids={pid})
            else:
                artist_metadata[artist_uri]["occurrence"] += 1
                artist_metadata[artist_uri]["albums"].add(album_uri)
                artist_metadata[artist_uri]["tracks"].add(track_uri)
                artist_metadata[artist_uri]["pids"].add(pid)
Example #5
def summarize(path, instance, verbose):
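    # Summarize a challenge file: for every category, count its playlists and
    # average their num_tracks / num_samples / num_holdouts, then print the
    # per-category and overall rows via tabulate. With verbose == 1, each
    # playlist's pid, category id and category display label are also printed;
    # `instance` is passed through to util.search_category to resolve the ids.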
    print("\nFile: %s\n" % path)

    stats, summary = {}, []

    for playlist in util.read_dataset_json(path):
        cid = playlist["category"]
        c = util.search_category(instance, cid)

        if cid not in stats:
            stats[cid] = dict(instances=0,
                              num_tracks=[],
                              num_samples=[],
                              num_holdouts=[])

        stats[cid]["instances"] += 1
        stats[cid]["num_tracks"].append(playlist["num_tracks"])
        stats[cid]["num_samples"].append(playlist["num_samples"])
        stats[cid]["num_holdouts"].append(playlist["num_holdouts"])

        if verbose == 1:
            print(" ".join([str(playlist["pid"]), str(cid), c["display"]]))

    total, all_tracks, all_samples, all_holdouts = 0, [], [], []

    for k, v in sorted(stats.items()):
        summary.append([k,
                        v["instances"],
                        statistics.mean(v["num_tracks"]),
                        statistics.mean(v["num_samples"]),
                        statistics.mean(v["num_holdouts"])])

        total += v["instances"]

        all_tracks.extend(v["num_tracks"])
        all_samples.extend(v["num_samples"])
        all_holdouts.extend(v["num_holdouts"])

    summary.append(["overall",
                    total,
                    statistics.mean(all_tracks),
                    statistics.mean(all_samples),
                    statistics.mean(all_holdouts)])

    print(tabulate(summary,
                   headers=["category", "instances", "avg tracks",
                            "avg samples", "avg holdouts"]))
Example #6
def build(mpd_path, output_path, size, instance):
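    # Fill the module-level ttv dict (presumably train/test/validation splits)
    # with masked playlists drawn from randomly shuffled MPD files, until the
    # splits together hold len(ttv) * size playlists. Masking follows the
    # recsys scheme or the custom one depending on `instance`; the result is
    # then written out via dump(output_path).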
    required = len(ttv) * size

    files = listdir(mpd_path)
    random.shuffle(files)

    for file in files:
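        # Stop reading further files once the splits together hold the
        # required number of playlists.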
        if sum(len(v) for v in ttv.values()) == required:
            break

        print("Processing %s" % file)

        items = util.read_dataset_json(join(mpd_path, file))
        random.shuffle(items)

        for item in items:
            if instance == "recsys":
                mask_recsys(item, size)
            else:
                mask_custom(item, size)
            if sum(len(v) for v in ttv.values()) == required:
                break

    dump(output_path)