def analyse(movie_list, rating_list):
    """
    Prints movie recommendations for the given movies and their ratings.

    The titles are converted to movie labels, ``find_closest_user`` picks a
    user for those labels and ratings, and the saved Keras model predicts a
    rating for every movie listed in ``FILE_MOVIES_LABEL`` for that user.
    The highest scoring titles that were not part of the input are printed
    together with their predicted score.

    :param movie_list: movie titles used as the basis of the recommendation
    :param rating_list: ratings for ``movie_list`` (presumably in the same
        order -- confirm against ``find_closest_user``)
    """
    candidates = pd.read_csv(FILE_MOVIES_LABEL)
    movie_labels = convert_movie_names(candidates, movie_list)
    closest_user = find_closest_user(movie_labels, rating_list)
    # Every candidate row is scored for the same (closest) user
    candidates[USER_LABEL] = closest_user

    model = keras.models.load_model(f"{MODEL_NAME}.h5")  # type:keras.Model
    candidates[RATING] = model.predict(
        [candidates[MOVIE_LABEL], candidates[USER_LABEL]])

    # Take more than ten rows; the surplus presumably compensates for the
    # input titles that are removed from the result further down
    shortlist = candidates.sort_values(by=RATING, ascending=False)
    shortlist = shortlist.head(10 + len(movie_list))
    rating_map = dict(zip(shortlist[MOVIE_ID], shortlist[RATING]))
    shortlisted_ids = shortlist[MOVIE_ID].values

    catalogue = get_movies()
    picks = catalogue[catalogue[MOVIE_ID].isin(shortlisted_ids)]
    # Do not recommend what the user already rated
    picks = picks[~picks[MOVIE_TITLE].isin(movie_list)]
    picks[RATING] = picks[MOVIE_ID].map(
        lambda movie_id: rating_map[movie_id])
    picks = picks.sort_values(by=RATING, ascending=False)

    scores = picks[RATING].values
    titles = picks[MOVIE_TITLE].values

    print("Recommended Movies [score]")
    for rank, (title, score) in enumerate(zip(titles, scores), start=1):
        print(f"({rank}) {title} [{round(score, 3)}]")
def convert_movie_names(all_movies, names):
    """
    Converts movie titles to the movie labels stored in ``all_movies``.

    Titles are first mapped to movie IDs using ``get_movies()`` and the IDs
    are then mapped to labels using the ``MOVIE_ID`` / ``MOVIE_LABEL``
    columns of ``all_movies``.

    :param all_movies: DataFrame providing MOVIE_ID and MOVIE_LABEL columns
    :param names: iterable of movie titles
    :return: list of movie labels, one per title, in the order of ``names``
    :raises KeyError: if a title is not present in ``get_movies()`` or if
        the matching movie has no label in ``all_movies``
    """
    catalogue = get_movies()
    title_by_id = dict(zip(catalogue[MOVIE_ID], catalogue[MOVIE_TITLE]))
    # Inverted mapping; when a title occurs more than once the last ID wins
    id_by_title = {title: movie_id
                   for movie_id, title in title_by_id.items()}
    label_by_id = dict(zip(all_movies[MOVIE_ID], all_movies[MOVIE_LABEL]))

    # Convert movie names to ids
    input_movies = []
    for name in names:
        try:
            input_movies.append(id_by_title[name])
        except KeyError:
            # Same exception type as before, but tell the user which title
            # could not be resolved instead of surfacing a bare key
            raise KeyError(f"'{name}' was not found in the movie list. "
                           f"Please check the movie title") from None

    # Convert movie IDs to movie labels
    movie_labels = []
    for movie_id in input_movies:
        try:
            movie_labels.append(label_by_id[movie_id])
        except KeyError:
            raise KeyError(
                f"'{title_by_id[movie_id]}' is not available in current "
                f"analysis. "
                f"Please use another movie") from None
    return movie_labels
def get_stats():
    """
    Generates basic statistics of the MovieLens database

    Prints the number of movies that have a usable year, the number of
    distinct genres among them, the first and last year, and the number of
    ratings and distinct users found in ``FILE_RATINGS``.
    """
    movies = get_movies().fillna("")
    # Keep only the numeric characters of the year and drop movies for
    # which nothing is left
    movies[YEAR] = movies[YEAR].map(
        lambda raw: "".join(ch for ch in raw if str(ch).isnumeric()))
    movies = movies[movies[YEAR] != ""]
    print(f"Total Movies : {len(movies)}")

    # Collect genres into a set directly. Summing the per-movie lists would
    # build a new list for every row (quadratic) and fails on an empty frame.
    categories = set()
    for genres in movies[MOVIE_GENRES]:
        categories.update(genres.split("|"))
    print(f"Total Categories : {len(categories)}")

    # Years are still strings at this point, so min/max compare as text
    print(f"From year {movies[YEAR].min()} to {movies[YEAR].max()}")

    ratings = pd.read_csv(FILE_RATINGS)
    print(f"Total ratings : {len(ratings)}")
    print(f"Total users : {ratings[USER_ID].nunique()}")
def generate_data(desired_movie="Jumanji"):
    """
    Runs the analysis for one movie and stores the results under ``data/``.

    Tag and genre distance matrices are computed with
    ``convert_to_distance``, the rows belonging to ``desired_movie`` are
    passed to ``generate_score``, and both matrices plus a title/rating
    table are written to disk.

    :param desired_movie: title of the movie to analyse; must match an entry
        of the MOVIE_TITLE column exactly. Defaults to "Jumanji", the value
        that used to be hard-coded.
    :raises ValueError: if ``desired_movie`` is not among the titles
    """
    print(f"Analysis started for '{desired_movie}'")
    movies = get_movies()
    tags = add_tags(movies)
    ratings = add_ratings(tags)

    tag_dm = convert_to_distance(ratings, TAG)
    cat_dm = convert_to_distance(ratings, MOVIE_GENRES)

    # Position of the movie in ``ratings``; used to pick its row from each
    # matrix
    ind = list(ratings[MOVIE_TITLE].values).index(desired_movie)
    # NOTE(review): the return value is ignored and RATING is read below, so
    # generate_score presumably updates ``ratings`` in place -- confirm
    generate_score(ratings, tag_dm[ind], cat_dm[ind])

    np.savez_compressed("data/tag_matrix", tag_dm)
    np.savez_compressed("data/genre_matrix", cat_dm)

    ratings[RATING] = round(ratings[RATING], 4)
    ratings[[MOVIE_TITLE, RATING]].to_csv("data/names.csv", index=False)
    print("Analysis finished")
def plot_movies():
    """
    Plots the number of movies per year in the MovieLens Database

    The chart is saved as "plot.png" and then shown on screen.
    """
    palette = Palette()
    plt.rcParams['axes.facecolor'] = palette.gray(shade=10)

    movies = get_movies().fillna("")
    movies = movies[movies[YEAR] != ""]
    # Keep only the numeric characters of the year before converting it
    movies[YEAR] = movies[YEAR].map(
        lambda raw: "".join(ch for ch in raw if str(ch).isnumeric()))
    movies[YEAR] = pd.to_numeric(movies[YEAR])
    # Drops implausibly large values (presumably malformed years)
    movies = movies[movies[YEAR] < 3020]
    total_movies = len(movies)

    per_year = movies.groupby(by=YEAR).count().sort_index().reset_index()
    years = list(per_year[YEAR])
    counts = list(per_year[MOVIE_ID])

    # Bars are placed by position in the sorted year list, not by year value
    plt.bar(range(0, len(years)), counts, width=1,
            color=palette.aqua(shade=40), zorder=3)

    # Label every fifth bar with its year
    tick_labels = [int(year) for year in years[::5]]
    tick_positions = range(0, len(tick_labels) * 5, 5)
    plt.xticks(tick_positions, tick_labels, rotation=90)

    plt.ylabel("Frequency")
    plt.xlabel("Year", labelpad=10)
    plt.grid(ls="--", color=palette.gray(shade=30), zorder=-1)

    # LaTeX rendering is switched on for the \small markup used below
    rc('text', usetex=True)
    caption = (f"MovieLens Dataset\n"
               r"\small{(Total movies: " + str(total_movies) + r")}")
    plt.annotate(caption, (0.1, 0.8), xycoords="axes fraction",
                 fontsize=13,
                 bbox=dict(fc=palette.white(), lw=0.1, pad=10))

    plt.tight_layout()
    plt.savefig("plot.png", dpi=150)
    plt.show()
def prepare_data():
    """
    Prints the movies returned by ``get_movies()``
    """
    print(get_movies())