def analyze_user(user_id, overwrite): """Analyze a user, to find out which predictor works best for them. :param user_id: A user ID. The user for which an analysis is being performed. :param overwrite: If a user has already been analyzed, should the analysis be overwritten? :returns: Nothing. """ with common.get_db_conn() as conn: # What if a user already has a predictor? if not overwrite and conn.execute( 'SELECT * FROM predictors WHERE userId=?', (user_id,) ).fetchall(): return # Create a personalized predictor for this user. SQLite added # support for UPSERT in version 3.24.0, which was released on # 2018-06-24. See: https://www.sqlite.org/lang_UPSERT.html sses = calc_sse(user_id) predictor = min_sse(sses) with conn: conn.execute( """ INSERT INTO predictors VALUES (?, ?) ON CONFLICT (userId) DO UPDATE SET predictor=? """, (user_id, predictor, predictor) )
def similarity(movie_a, movie_b): """Return the similarity score for the two given movies. :param movie_a: A movie ID. :param movie_b: A movie ID. :return: The similarity score for the pair of movies. :raise movie_recommender.exceptions.MissingSimilarityError: If no similarity score has been computed for this pair of movies. """ movies = [movie_a, movie_b] movies.sort() with common.get_db_conn() as conn: row = conn.execute( """ SELECT similarity FROM similarities WHERE movieAId=? AND movieBId=? """, movies, ).fetchone() if not row: raise exceptions.MissingSimilarityError(f""" A similarity score hasn't been computed for movies {movies[0]} and {movies[1]}. """) return row[0]
def rating_pairs(movie_a, movie_b): """Count the number of rating pairs for the given movies. See :meth:`movie_recommender.db.read.rating_pairs`. :param movie_a: A movie ID. :param movie_b: A movie ID. :return: An integer. The number of movie rating pairs for the given movies. """ if movie_a == movie_b: raise ValueError(f""" Fetching pairs of ratings for a movie and itself is disallowed. Movie IDs: {movie_a}, {movie_b} """) movies = [movie_a, movie_b] movies.sort() with common.get_db_conn() as conn: row = conn.execute( """ SELECT COUNT (*) FROM ( SELECT userId, rating FROM ratings WHERE movieId = ? ) movieARatings INNER JOIN ( SELECT userId, rating FROM ratings WHERE movieId = ? ) movieBRatings WHERE movieARatings.userId = movieBRatings.userId """, movies, ).fetchone() return row[0]
def similar_movies_for_user(movie, user): """Yield movies similar to ``movie`` that ``user`` has rated. .. NOTE:: A "similar" movie is one with a non-zero similarity score. This includes negative similarity scores! :param movie: A movie ID. :param user: A user ID. :return: A generator yielding tuples of the form ``(movie_id, similarity)``. """ rated_movies_ = rated_movies((user, )) with common.get_db_conn() as conn: # There's probably some clever technique for expressing the following # queries as a single SQL query. for row in conn.execute( """ SELECT movieAId, similarity FROM similarities WHERE movieBId = ? AND similarity != 0 """, (movie, )): if row[0] in rated_movies_: yield row for row in conn.execute( """ SELECT movieBId, similarity FROM similarities WHERE movieAId = ? AND similarity != 0 """, (movie, )): if row[0] in rated_movies_: yield row
def all_movies(): """Yield the IDs of every movie. :return: A generator that yields movie IDs. """ with common.get_db_conn() as conn: for row in conn.execute('SELECT DISTINCT movieId FROM movies'): yield row[0]
def user_ids(): """Count the number of users in the current dataset. :return: An integer. """ with common.get_db_conn() as conn: return conn.execute( 'SELECT COUNT(DISTINCT userId) FROM ratings').fetchone()[0]
def avg_ratings(): """Count the number of entries in the "avgRatings" table. :return: An integer. """ with common.get_db_conn() as conn: return conn.execute( 'SELECT COUNT(DISTINCT userId) FROM avgRatings').fetchone()[0]
def users(): """Get the ID of every user. :return: A tuple of user IDs. """ with common.get_db_conn() as conn: return { row[0] for row in conn.execute('SELECT DISTINCT userId FROM ratings') }
def users_in_avg_ratings(): """Get the ID of every user in the avgRatings table. :return: A tuple of user IDs. """ with common.get_db_conn() as conn: return { row[0] for row in conn.execute('SELECT DISTINCT userId FROM avgRatings') }
def genres(movie_id): """Get the genres of the given movie. :param movie_id: A movie ID. :return: An iterable of genres, as strings. """ with common.get_db_conn() as conn: genres_strings = tuple(row[0] for row in conn.execute( 'SELECT genres FROM movies WHERE movieId=?', (movie_id, ))) assert len(genres_strings) == 1 return genres_strings[0].split('|')
def unrated_movies(user_id): """Count the number of movies the given user hasn't rated. :param user_id: A user ID. :return: An integer. """ with common.get_db_conn() as conn: return conn.execute( """ SELECT COUNT(DISTINCT movieId) FROM movies WHERE movieId NOT IN (SELECT DISTINCT movieId FROM ratings WHERE userId=?) """, (user_id, )).fetchone()[0]
def title(movie_id): """Get the title of the given movie. :param movie_id: A movie ID. :return: The title of the given movie. """ with common.get_db_conn() as conn: row = conn.execute('SELECT title FROM movies WHERE movieId=?', (movie_id, )).fetchone() if not row: raise ValueError(f'Movie ID {movie_id} not in database.') return row[0]
def unrated_movies(user_id): """Yield the ID of each movie the given user hasn't rated. :param user_id: A user ID. :return: A generator that yields movie IDs. """ with common.get_db_conn() as conn: for row in conn.execute( """ SELECT DISTINCT movieId FROM movies WHERE movieId NOT IN (SELECT DISTINCT movieId FROM ratings WHERE userId=?) """, (user_id, )): yield row[0]
def rating(user_id, movie_id): """Get the rating that the given user gave to the given movie. :param user_id: A user ID. :param movie_id: A movie ID. :return: A movie rating. (A float.) """ with common.get_db_conn() as conn: ratings = tuple(row[0] for row in conn.execute( 'SELECT rating FROM ratings WHERE userId=? and movieId=?', ( user_id, movie_id))) assert len(ratings) == 1 return ratings[0]
def avg_user_rating(user): """Calculate the average of a user's movie ratings. :param user: A user ID. The user whose average ratings are being computed. :return: A value such as 3.5. """ with common.get_db_conn() as conn: return conn.execute( """ SELECT AVG(rating) FROM ratings WHERE userId=? """, (user, )).fetchone()[0]
def cpop_db(dataset): """Create and populate a new database. More specifically: * Create database tables for the datasets. * Create database tables for calculated data. (i.e. Create a table which maps userId → predictorName.) * Populate the dataset tables. :param dataset: The dataset to populate the new database with. Use one of the keys from :data:`movie_recommender.constants.DATASETS`. :return: Nothing :raises DatabaseAlreadyExistsError: If the target database already exists. :raises DatasetAbsentError: If the referenced dataset isn't installed. """ # Check whether a conflicting database exists. save_path = Path(common.get_save_path()) if save_path.exists(): raise exceptions.DatasetAbsentError( "Can't create a new database, as a file already exists at: {}". format(save_path), ) # Check whether the dataset is installed or not. installed_datasets = datasets.get_installed_datasets() if dataset not in installed_datasets: raise exceptions.DatabaseAlreadyExistsError( "Can't create a database from the {} dataset, as it isn't " 'installed.'.format(dataset)) # Create and populate a new database. with common.get_db_conn(save_path) as conn: cpop_links_table( conn, Path(installed_datasets[dataset], 'links.csv'), ) cpop_movies_table( conn, Path(installed_datasets[dataset], 'movies.csv'), ) cpop_ratings_table( conn, Path(installed_datasets[dataset], 'ratings.csv'), ) cpop_tags_table( conn, Path(installed_datasets[dataset], 'tags.csv'), ) c_predictors_table(conn) c_similarities_table(conn) c_avg_ratings_table(conn)
def rating_pairs(movie_a, movie_b): """Yield pairs of ratings for the given movies. Logically, this function generates a table in the following form. Notice that the table is perfectly dense. ==== ======= ======= User Movie A Movie B ==== ======= ======= 1 1.0 0.5 26 4.0 1.5 2 5.0 0.5 ==== ======= ======= :param movie_a: A movie ID. A movie to get ratings for. :param movie_b: A movie ID. A movie to get ratings for. :rtype movie_recommender.db.RatingPair: :return: A generator that yields every pair of ratings for the two given movies. :raise: ``ValueError`` if ``movie_a`` and ``movie_b`` are equal. """ if movie_a == movie_b: raise ValueError(f""" Fetching pairs of ratings for a movie and itself is disallowed. Movie IDs: {movie_a}, {movie_b} """) movies = [movie_a, movie_b] movies.sort() with common.get_db_conn() as conn: for row in conn.execute( """ SELECT movieARatings.userId, movieARatings.rating, movieBRatings.rating FROM ( SELECT userId, rating FROM ratings WHERE movieId = ? ) movieARatings INNER JOIN ( SELECT userId, rating FROM ratings WHERE movieId = ? ) movieBRatings WHERE movieARatings.userId = movieBRatings.userId """, movies): yield common.RatingPair(row[0], row[1], row[2])
def make_genre_predictor(genre, user_id, forbidden_movie=None): """Make a predictor for the given genre for the given user. :param genre: A genre name, as a string. :param user_id: A user ID. The user for which a predictor is being created. :param forbidden_movie: A movie ID. A movie to ignore when creating the predictor. :return: A function which accepts a movie ID and returns a predicted rating. """ query = """ SELECT movies.genres, ratings.rating FROM movies JOIN ratings USING (movieId) WHERE ratings.userId == ? """ params = [user_id] if forbidden_movie: query += 'AND movieId != ?' params.append(forbidden_movie) # Iterate through movies this user has rated. For each movie, create a # Cartesian point, where X is whether the move has the given genre, and Y # is the rating this user has given to this movie. points = [] with common.get_db_conn() as conn: for row in conn.execute(query, params): genres = row[0].split('|') genre_present = 1 if genre in genres else 0 rating = row[1] points.append(Point(genre_present, rating)) graph = Graph(points) def predictor(movie_id): """Predict a user's rating for the given movie. :param movie_id: A movie ID. :return: A predicted rating for the given movie. """ genres = read.genres(movie_id) genre_present = 1 if genre in genres else 0 try: rating = graph.predict_y(genre_present) except exceptions.VerticalLineOfBestFitGraphError: rating = graph.avg_point.y return clamp_rating(rating) return predictor
def avg_rating(user_id): """Get the average of a user's ratings, from the avgRatings table. :param user_id: A user ID. The user whose average rating is being fetched. :return: An average rating, such as 3.5. """ with common.get_db_conn() as conn: row = conn.execute( """ SELECT avgRating FROM avgRatings WHERE userId=? """, (user_id, ), ).fetchone() if not row: raise exceptions.MissingAverageRatingError( f'No average rating for user {user_id} has been calculated.') return row[0]
def predictor_name(user_id): """Get the personalized predictor name for the given user. :param user_id: A user ID. The user for which the personalized predictor name is being fetched. :return: The name of the personalized predictor for the given user. :raise movie_recommender.exceptions.NoPersonalizedPredictorError: If the given user doesn't have a personalized predictor. """ with common.get_db_conn() as conn: row = conn.execute('SELECT predictor FROM predictors WHERE userId=?', (user_id, )).fetchone() if not row: raise exceptions.NoPersonalizedPredictorError( f'User {user_id} has no personalized predictors. Please generate ' 'one with "mr-analyze".', ) return row[0]
def similarities(similarities_): """Write movies similarity scores to the database. :param similarities_: An iterable of :class:`movie_recommender.db.common.Similarity` objects. """ # SQLite added support for UPSERT in version 3.24.0, which was released on # 2018-06-24. See: https://www.sqlite.org/lang_UPSERT.html with common.get_db_conn() as conn: with conn: conn.executemany( """ INSERT INTO similarities VALUES (?, ?, ?) ON CONFLICT (movieAId, movieBId) DO UPDATE SET similarity=? """, _similarities_values(similarities_), )
def to_user_id(arg): """Cast the given string argument to a user ID, if possible. An exception of some kind is raised if ``arg`` can't be cast to a user ID. The specific type of exception varies. :param arg: A string argument passed on the command line. Semantically, a user ID. :return: A user ID. """ user_id = int(arg) with common.get_db_conn() as conn: user_ids = tuple(row[0] for row in conn.execute( 'SELECT DISTINCT userId FROM ratings WHERE userId=?', (user_id, ))) if user_id not in user_ids: raise ValueError(f'User ID {user_id} not in database.') return user_id
def avg_ratings(avg_ratings_): """Write user average ratings to the database. :param avg_ratings_: An iterable of :class:`movie_recommender.db.common.AvgRating` objects. """ with common.get_db_conn() as conn: with conn: values = (avg_rating + (avg_rating.avg_rating, ) for avg_rating in avg_ratings_) conn.executemany( """ INSERT INTO avgRatings VALUES (?, ?) ON CONFLICT (userId) DO UPDATE SET avgRating=? """, values, )
def rated_movies(user_ids): """Get the IDs of the movies the given users have rated. :param user_ids: An iterable of user IDs. :return: A set of movie IDs. """ with common.get_db_conn() as conn: return { row[0] for row in conn.execute( f""" SELECT DISTINCT movieId FROM ratings WHERE userId IN ({ ', '.join('?' for _ in range(len(user_ids))) }) """, tuple(user_ids), ) }
def avg_movie_rating(movie): """Calculate the average of a movie's ratings. :param movie: A movie ID. The movie whose average rating is being computed. :return: A value such as 3.5. :raise movie_recommender.exceptions.NoMovieRatingsError: If an average can't be calculated due to a lack of ratings. """ with common.get_db_conn() as conn: avg = conn.execute( """ SELECT AVG(rating) FROM ratings WHERE movieId=? """, (movie, )).fetchone()[0] if avg is None: raise exceptions.NoMovieRatingsError( f"Can't calculate the average rating for movie {movie}, as no " 'ratings have been assigned to it.') return avg
def make_year_predictor(user_id, forbidden_movie=None): """Make a year-based predictor for the given user. If year information can't be extracted from a movie's title, then that movie is skipped when generating a predictor. This is done because so few movies have this issue. See: `class:`movie_recommender.exceptions.NoMovieYearError`. :param user_id: A user ID. The user for which a predictor is being created. :param forbidden_movie: A movie ID. A movie to ignore when creating the predictor. :return: A function which accepts a movie ID and returns a predicted rating. """ query = """ SELECT movies.title, ratings.rating FROM movies JOIN ratings USING (movieId) WHERE ratings.userId == ? """ params = [user_id] if forbidden_movie: query += 'AND movieId != ?' params.append(forbidden_movie) # Iterate through movies this user has rated. For each movie, create a # Cartesian point, where X is the movie's year, and Y is the rating this # user has given to this movie. points = [] with common.get_db_conn() as conn: for row in conn.execute(query, params): try: year = read.year(row[0]) except exceptions.NoMovieYearError: continue rating = row[1] points.append(Point(year, rating)) graph = Graph(points) def predictor(movie_id): """Predict a user's rating for the given movie. :param movie_id: A movie ID. :return: A predicted rating for the given movie. :raise movie_recommender.exceptions.NoMovieYearError: If the given movie's title doesn't include a year, and this predictor makes use of year data. :raise movie_recommender.exceptions.EmptyGraphError: If this predictor can't predict movie ratings at all, due to a lack of relevant data. For example, this will occur if the given movie's title does include a year, but all of the movies this user has rated lack a year. """ title = read.title(movie_id) year = read.year(title) try: rating = graph.predict_y(year) except exceptions.VerticalLineOfBestFitGraphError: rating = graph.avg_point.y return clamp_rating(rating) return predictor