Пример #1
0
def median_eval_setup(request):
    """
    Load the data used by the median evaluation.

    "request" is accepted for interface compatibility; it is not read here.

    Sets:
        * _data["user_ratings_test"]
        * _process_data["movie_medians_train"]
    """
    # data for worker server
    test_ratings = movie_lens_data.get_input_obj("user_ratings_test")
    _data["user_ratings_test"] = test_ratings

    # data for worker processors: every processor gets the same medians
    shared = {
        "movie_medians_train":
        movie_lens_data.get_input_obj("movie_medians_train")
    }
    _proc.send_same_data(shared)
Пример #2
0
def als_eval_setup(request):
    """
    Load the data used by the ALS evaluation.

    request["setup_param"] is the factor value; it is used to build the
    "als<factor>_" prefix of the ALS object names and is also forwarded to
    the worker processors as "num_item_factors".

    Sets:
        * _data["user_ratings_test"]
        * _process_data["movie_medians_train"]
        * _process_data["num_item_factors"]
        * _process_data["als_user_factors"]
        * _process_data["als_user_ids"]
        * _process_data["als_movie_factors"]
        * _process_data["als_movie_ids"]
    """
    num_factors = request["setup_param"]
    prefix = "".join(("als", str(num_factors), "_"))

    def load_als(suffix):
        # fetch the ALS object whose name is the prefix followed by "suffix"
        return movie_lens_data.get_als_obj(prefix + suffix)

    # data for worker server
    _data["user_ratings_test"] = load_als("user_ratings_test")

    # data for worker processors
    # (the dict literal is evaluated top to bottom, so objects are loaded in
    # the order they are listed)
    shared = {
        "movie_medians_train":
        movie_lens_data.get_input_obj("movie_medians_train"),
        "num_item_factors": num_factors,
        "als_user_factors": load_als("user_factors"),
        "als_user_ids": load_als("user_ids"),
        "als_movie_factors": load_als("item_factors"),
        "als_movie_ids": load_als("movie_ids")
    }
    _proc.send_same_data(shared)
Пример #3
0
def tag_count_eval_setup(request):
    """
    Load the data used by the tag count evaluation.

    "request" is accepted for interface compatibility; it is not read here.

    Sets:
        * _data["user_ratings_train"]
        * _data["user_ratings_test"]
        * _process_data["movie_genres"]
        * _process_data["movie_tags"]
        * _process_data["tag_counts"]
        * _process_data["genre_counts"]
        * _process_data["movie_medians_train"]
    """
    # data for worker server
    for ratings_name in ("user_ratings_train", "user_ratings_test"):
        _data[ratings_name] = movie_lens_data.get_input_obj(ratings_name)

    # data for worker processors; each object is sent under its own name
    shared_names = ("movie_genres", "movie_tags", "tag_counts",
                    "genre_counts", "movie_medians_train")
    shared = {}

    for obj_name in shared_names:
        shared[obj_name] = movie_lens_data.get_input_obj(obj_name)

    _proc.send_same_data(shared)
Пример #4
0
def build_similar_movies_db_setup(request):
    """
    Load the data used to build the similar movies database.

    request["setup_param"] must contain "buff_point" and "buff_limit"; both
    values are forwarded unchanged to the worker processors.

    Sets:
        * _process_data["buff_point"]
        * _process_data["buff_limit"]
        * _process_data["movie_ratings"]
        * _process_data["movie_genres"]
    """
    # tuning values supplied with the request
    setup_param = request["setup_param"]
    shared = {
        "buff_point": setup_param["buff_point"],
        "buff_limit": setup_param["buff_limit"]
    }

    # objects loaded from disk
    for obj_name in ("movie_ratings", "movie_genres"):
        shared[obj_name] = movie_lens_data.get_input_obj(obj_name)

    # send data to worker processors
    _proc.send_same_data(shared)
def main():
    """
    Load or build the title search index, then print diagnostics and a few
    sample searches.

    All output after the first message is written to
    "title_search_output.txt" (UTF-8). The file is closed and "sys.stdout" is
    restored when the function finishes, including when an exception is
    raised.

    Command line arguments:
        * "overwrite" - passed to "movie_lens_data.read_movies_csv(...)" and
          forces the index to be rebuilt even if the index file exists
    """
    import contextlib

    # redirect output to file to avoid Unicode printing errors
    print('see output in "title_search_output.txt"')

    with open("title_search_output.txt", "w", encoding="utf-8") as out_file, \
            contextlib.redirect_stdout(out_file):
        # process command line arguments
        overwrite = "overwrite" in sys.argv

        # movie_titles is {movie_id: title}
        movie_lens_data.read_movies_csv(overwrite)
        movie_titles = movie_lens_data.get_input_obj("movie_titles")

        # get "index", either load it from disk, or rebuild it
        index_file_name = movie_lens_data.out_dir + "title_search_index.bin"
        indexer_config = IndexerConfig()

        if os.path.exists(index_file_name) and not overwrite:
            index = Index(indexer_config, index_file_name)
        else:
            # build an "Index" object using "movie_titles"
            index = Index(indexer_config, None)
            index.build(movie_titles, index_file_name)

        # print properties of the index
        index.print_frequent_tokens(100)
        index.print_frequent_bigrams(50)
        index.print_non_alpha_num_words()

        for ending in ("s", "ing", "ion"):
            index.print_words_with_ending(ending)

        # print some searches with the index
        search(index, "star war", movie_titles)  # should match "star wars"
        search(index, "star trek 2",
               movie_titles)  # should match "star trek ii"
        search(index, "battle of gods dragon ball",
               movie_titles)  # prioritize "dragon ball" movies
        search(index, "ad", movie_titles)  # should match "a.d"
        search(index, "shield", movie_titles)  # should match "S.H.I.E.L.D."
Пример #6
0
def main():
    """
    Send a "tag_ls_eval" command to the cluster nodes, wait for completion
    and print the rank agreement results.

    Prints a message and returns early when "cluster.cluster_info" is None.
    """
    test_set_length = movie_lens_data.get_input_obj(
        "user_ratings_test_length")

    # nothing can be distributed without cluster information
    if cluster.cluster_info is None:
        print("Cannot connect to cluster.")
        return

    # Sends a "tag_ls_eval" command to cluster nodes.
    command = {
        "op": "distribute",
        "worker_op": "tag_ls_eval",
        "length": test_set_length
    }
    cluster.send_command(command)

    cluster.wait_for_completion()
    cluster.print_status()
    print()

    # each merged result is unpacked as a (user_id, agreement) pair; only the
    # agreements are reported
    merged_results = cluster.merge_list_results("tag_ls_eval_results.bin")
    _user_ids, agreements = zip(*merged_results)

    my_util.print_rank_agreement_results(agreements, "tag least squares")

    # emit the BEL character when done
    print('\a')
def main():
    """
    Tune a "SimilarMovieFinder" and build the database of similar movies.

    Returns early, after printing a message, when the output file already
    exists (unless "overwrite" is given) or when one of the two tuning movie
    IDs is missing from "movie_titles" or no longer has the expected title.

    Command line arguments:
        * "overwrite" - rebuild even if the output file already exists; also
          passed to "movie_lens_data.create_movie_ratings(...)"
        * "cpu_count=N" - passed to "build_locally(...)"
    """
    # process command line arguments
    overwrite = False
    cpu_count = None

    for arg in sys.argv:
        if arg == "overwrite":
            overwrite = True
        elif arg.startswith("cpu_count="):
            cpu_count = int(arg.split(sep='=')[1])

    similar_movies_file_name = config.out_dir + "similar_movies.bin"

    # exit if object already exists
    if not overwrite and os.path.exists(similar_movies_file_name):
        print(similar_movies_file_name, "already exists")
        return

    movie_lens_data.start_time = time.time()

    # The SimilarMovieFinder class is tuned using:
    # movie id 1196 - Star Wars: Episode V - The Empire Strikes Back
    # movie id 1210 - Star Wars: Episode VI - Return of the Jedi
    movie_id1 = 1196
    movie_id2 = 1210

    # check movie_id titles - to be sure that the movie_id is still valid
    movie_lens_data.read_movies_csv()
    movie_titles = movie_lens_data.get_input_obj("movie_titles")

    expected_titles = (
        (movie_id1, "empire strikes back"),
        (movie_id2, "return of the jedi"),
    )

    for tuning_id, expected in expected_titles:
        # An ID that is missing altogether is reported the same way as an ID
        # whose title changed, instead of failing with a lookup error.
        try:
            title = movie_titles[tuning_id]
        except (KeyError, IndexError):
            title = ""

        if expected not in title.lower():
            print('Movie ID', tuning_id, 'no longer "' + expected + '",',
                  "cannot continue")
            return

    # create SimilarMovieFinder and tune
    movie_ratings = movie_lens_data.create_movie_ratings(overwrite)
    movie_genres = movie_lens_data.get_input_obj("movie_genres")
    movie_finder = SimilarMovieFinder(movie_genres, movie_ratings)
    movie_finder.tune(movie_id1, movie_id2, 2, 20)

    print("SimilarMovieFinder tuning results in buff_point =",
          movie_finder.buff_point, "buff_limit =", movie_finder.buff_limit)

    print()
    print("Movies similar to \"" + movie_titles[movie_id1] + "\":")

    movie_id1_index = movie_finder.find_movie_index(movie_id1)
    movie_ids, _ = movie_finder.find_similar_movie(movie_id1_index)

    for movie_id in movie_ids:
        print(movie_titles[movie_id])

    # build database of similar movies
    print()
    print("Starting to build database of similar movies")

    # build on this machine when no cluster information is available
    if cluster.cluster_info is None:
        build_locally(movie_genres, movie_ratings, movie_finder.buff_point,
                      movie_finder.buff_limit, similar_movies_file_name,
                      cpu_count)
    else:
        build_with_cluster(movie_finder.buff_point, movie_finder.buff_limit,
                           len(movie_ratings), "similar_movies.bin")
 def __init__(self):
     """
     Load the "similar_movies" output object and the "movie_titles" input
     object into instance attributes.
     """
     similar_movies = movie_lens_data.get_output_obj("similar_movies")
     self.similar_movies = similar_movies

     # "read_movies_csv()" is called before "movie_titles" is requested
     movie_lens_data.read_movies_csv()
     movie_titles = movie_lens_data.get_input_obj("movie_titles")
     self.movie_titles = movie_titles
Пример #9
0
def main():
    """
    Prepare the ALS data sets with worker processes, then run ALS training.

    Command line arguments:
        * "overwrite" - passed to the "create_*" functions and forces the
          full data set movie medians to be recomputed
        * "training_set_ratio=F" - float, default 0.8; a value >= 1 uses the
          whole data set as the training set
        * "cpu_count=N" - int, passed to "_proc.start_processes(...)"
        * "als_thread_count=N" - int, passed to "movie_lens_data.als_train"
        * "algorithm=N" - int, default 1, passed to
          "movie_lens_data.als_train"
    """
    # process command line arguments
    overwrite = False
    options = {
        "training_set_ratio": 0.8,
        "cpu_count": None,
        "als_thread_count": None,
        "algorithm": 1,
    }
    converters = {
        "training_set_ratio": float,
        "cpu_count": int,
        "als_thread_count": int,
        "algorithm": int,
    }

    for arg in sys.argv:
        if arg == "overwrite":
            overwrite = True
            continue

        # "name=value" arguments; the value is the text between the first
        # and the second '='
        name, separator, _ = arg.partition("=")

        if separator and name in converters:
            options[name] = converters[name](arg.split(sep='=')[1])

    # Start extra processes and shrink the data set to meet ALS factor
    # requirements.
    _proc.start_processes(options["cpu_count"])

    factor_counts = [3, 5, 7, 9, 11]

    ratings_by_user = movie_lens_data.create_user_ratings(overwrite)

    if options["training_set_ratio"] >= 1:
        # Use the whole data set as training set.
        # "movie_lens_data.als_data_set_shrink_mp(...)" will assume
        # "user_ratings_train" and "user_ratings_test" to be in process memory.
        _proc.split_list_and_send(ratings_by_user, "user_ratings_train")
        _proc.send_same_data({"user_ratings_test": None})

        # compute movie medians, save to disk as "movie_medians_full.bin"
        medians_path = (movie_lens_data.in_dir + os.sep +
                        "movie_medians_full.bin")

        if os.path.exists(medians_path) and not overwrite:
            # reuse the medians saved by an earlier run
            full_medians = movie_lens_data.get_input_obj("movie_medians_full")

        else:
            ratings_by_movie = movie_lens_data.create_movie_ratings(overwrite)

            print("        Computing movie medians for the full data set")
            _proc.split_list_and_send(ratings_by_movie, "movie_ratings")
            _proc.run_function("_compute_medians2", {})
            full_medians = _proc.update_var_into_dict("movie_medians")

            print("        Saving", medians_path)
            with open(medians_path, mode="bw") as medians_file:
                pickle.dump(full_medians, medians_file)

        movie_lens_data.als_data_set_shrink_mp(full_medians,
                                               factor_counts,
                                               no_test_set=True)

    else:
        # training_set_ratio < 1
        train_medians = movie_lens_data.refresh_training_sets_mp(
            ratings_by_user, options["training_set_ratio"])

        movie_lens_data.als_data_set_shrink_mp(train_medians, factor_counts)

    # the worker processes are stopped before training starts
    _proc.end_processes()
    gc.collect()

    movie_lens_data.als_train(factor_counts, options["als_thread_count"],
                              options["algorithm"])

    # print run time
    # NOTE(review): "movie_lens_data.start_time" is not set in this function;
    # presumably it is initialised elsewhere - confirm.
    elapsed_seconds = int(time.time() - movie_lens_data.start_time)
    print("Total run time", datetime.timedelta(seconds=elapsed_seconds))
    print('\a')