def data_initialization(self):
        """
        Builds the item-rating model: a movie-to-movie correlation matrix
        derived from the user ratings.

        The ratings are joined with the movies on 'movieId', pivoted into a
        userId x title matrix (missing ratings become 0), transposed so that
        every row describes one movie, reduced to 12 components with
        TruncatedSVD and finally correlated row against row.
        @return: tuple of (movie titles in the row order of the matrix,
                 correlation matrix, movies dataframe as read from disk)
        """
        logging.debug(
            f'[{MovieRecommendationItemRating.data_initialization.__name__}] - start of function'
        )
        df_movies, df_ratings = self.read_files()
        rated_movies = pd.merge(df_ratings, df_movies, on='movieId')
        rated_movies = rated_movies.dropna(axis=0, subset=['title'])
        # The per-title rating count that used to be computed and merged in
        # here was never read afterwards, so it only cost a groupby and a
        # merge. Dropping it leaves the pivot table below unchanged.
        user_rating = rated_movies.drop_duplicates(['userId', 'title'])
        movie_user_rating_pivot = pd.pivot_table(user_rating,
                                                 index='userId',
                                                 columns='title',
                                                 values='rating').fillna(0)
        # Transpose: one row per movie title, one column per user
        X = movie_user_rating_pivot.values.T

        # calculating correlation matrix i.e. the model
        SVD = TruncatedSVD(n_components=12, random_state=17)
        matrix = SVD.fit_transform(X)
        corr = np.corrcoef(matrix)
        movie_title = movie_user_rating_pivot.columns
        return movie_title, corr, df_movies
Exemplo n.º 2
0
def map_string_to_movie(selection_query):
    """
    Looks up the movie whose title equals the given string, so that its id
    can be used to gather further information.
    @param selection_query: the exact title of the movie to look up
    @return: the rows (id and title column) of the movies file whose title
             equals the query; empty if no title matches
    """
    logging.debug(
        f'[{map_string_to_movie.__name__}] - start of function with selection query: {selection_query}'
    )
    movies_file = os.path.join(MOVIELENS_ROOT, 'movies.csv')
    column_types = {MOVIE_ID: 'int32', TITLE: 'str'}
    movies: pd.DataFrame = pd.read_csv(movies_file,
                                       encoding="UTF-8",
                                       usecols=[MOVIE_ID, TITLE],
                                       dtype=column_types)

    # Keep only the row(s) that carry exactly the requested title
    title_matches = movies[TITLE] == selection_query
    movie_object = movies.loc[title_matches]
    logging.debug(
        f'[{map_string_to_movie.__name__}] - Movie object: {movie_object}')

    return movie_object
    def get_similar_movies_based_on_genre(self, input_movie_title):
        """
        Returns the 5 movies that are most similar to the given one based on
        the genre similarity matrix.
        @param input_movie_title: the reference movie title for the recommendation
        @return: dictionary form (DataFrame.to_dict) of the similar movies
                 with the columns 'index' and 'title'
        @raise KeyError: if the title is not contained in the movie dataset
        """
        logging.debug(
            f'[{self.get_similar_movies_based_on_genre.__name__}] - '
            f'start of function with movie title: {input_movie_title}')
        cosine_sim, movie_content_df_temp = self.data_initialization(
            'resources/movies.csv')
        # Lookup table: movie title -> dataframe index, which is used below
        # as the row number of the similarity matrix
        indices = pd.Series(movie_content_df_temp.index,
                            movie_content_df_temp['title'])
        movie_index = indices[input_movie_title]
        if isinstance(movie_index, pd.Series):
            # The title occurs more than once in the dataset; use the first
            # entry instead of failing on a multi-row lookup
            movie_index = movie_index.iloc[0]

        # Leave out the reference movie explicitly. Cutting off the first
        # sorted entry is not reliable: other movies can reach the same score
        # as the movie itself, in which case the movie ended up in its own
        # recommendations.
        sim_scores = [(index, score)
                      for index, score in enumerate(cosine_sim[movie_index])
                      if index != movie_index]
        # Sort the movies based on the similarity scores
        sim_scores.sort(key=lambda x: x[1], reverse=True)

        # Get the indices of the 5 most similar movies
        movie_indices = [i[0] for i in sim_scores[:5]]
        similar_movies = pd.DataFrame(
            movie_content_df_temp[['title']].iloc[movie_indices])
        similar_movies = similar_movies.reset_index()
        return similar_movies.to_dict()
Exemplo n.º 4
0
    def metadata_recommender_with_keywords(self, movie_id):
        """
        Recommends movies by comparing the genres and keywords of every other
        movie with those of the given movie (Jaccard similarity of the scores)
        :param movie_id: the id of the movie
        :return: list of movie ids, may be [], maximum length is 5
        """
        logging.debug(
            f'[{self.metadata_recommender_with_keywords.__name__}] - start function with movie id: {movie_id}'
        )
        if movie_id not in self.movie_metadata:
            return []
        reference = self.movie_metadata[movie_id]
        genres = reference[GENRES_COL]
        keywords = reference[KEYWORDS_COL]

        # Score list of the reference movie itself
        reference_scores = []
        Recommender.add_score_to_list(genres, 2, reference_scores)
        Recommender.add_score_to_list(keywords, 10, reference_scores)

        similarity_by_movie = {}
        for candidate_id, candidate in self.movie_metadata.items():
            if candidate_id != movie_id:
                candidate_scores = []
                Recommender.match_with_bias(candidate[GENRES_COL], genres, 2,
                                            0, candidate_scores)
                Recommender.match_with_bias(candidate[KEYWORDS_COL], keywords,
                                            10, 5, candidate_scores)
                similarity_by_movie[candidate_id] = float(
                    sm.jaccard_similarity(reference_scores, candidate_scores))

        # Highest similarity first; only the best five are handed back
        ranked_ids = sorted(similarity_by_movie,
                            key=similarity_by_movie.get,
                            reverse=True)
        return ranked_ids[:5]
    def get_similar_movies_based_on_tags(self, input_movie_title):
        """
        Matches the given movie title with the 5 most similar movies based on the content tags
        @param input_movie_title: the movie title for the recommendation
        @return: dictionary form (DataFrame.to_dict) of the 5 most similar
                 movies with the columns 'index' and 'title'
        @raise KeyError: if the title is not contained in the movie content data
        """
        logging.debug(
            f'[{self.get_similar_movies_based_on_tags.__name__}] - start of function with title <{input_movie_title}>'
        )
        cosine_sim, movie_content_df_temp = self.read_model_content_data()
        # Lookup table: movie title -> dataframe index, which is used below
        # as the row number of the similarity matrix
        indices = pd.Series(movie_content_df_temp.index,
                            movie_content_df_temp['title'])

        movie_index = indices[input_movie_title]
        if isinstance(movie_index, pd.Series):
            # The title occurs more than once in the dataset; use the first
            # entry instead of failing on a multi-row lookup
            movie_index = movie_index.iloc[0]

        # Leave out the reference movie explicitly. Cutting off the first
        # sorted entry is not reliable, because another movie can tie with the
        # score the movie has with itself; the movie then showed up in its
        # own recommendations.
        sim_scores = [(index, score)
                      for index, score in enumerate(cosine_sim[movie_index])
                      if index != movie_index]
        # Sort the movies based on the similarity scores
        sim_scores.sort(key=lambda x: x[1], reverse=True)

        # Get the indices of the 5 most similar movies
        movie_indices = [i[0] for i in sim_scores[:5]]
        similar_movies = pd.DataFrame(
            movie_content_df_temp[['title']].iloc[movie_indices])
        similar_movies = similar_movies.reset_index()
        return similar_movies.to_dict()
Exemplo n.º 6
0
def _get_views_dict(movie_collection: list, movie_dict: dict) -> dict:
    logging.debug(
        f'[{_get_views_dict.__name__}] - start to transform movie collection to dictionary'
    )
    if type(movie_collection) is list:
        _get_movie_dict(movie_collection, movie_dict)
    elif type(movie_collection) is dict:
        _df_to_movie_dict(movie_collection, movie_dict)
    return movie_dict
    def offline_initialization(self, movies_data, tags_data,
                               max_tag_features=500):
        """
        Builds the tag based similarity model offline and stores it on disk.

        Writes the similarity matrix to 'model.txt' and the movie table
        enriched with the tag columns to 'movie_content.csv' (both relative to
        the current working directory).
        @param movies_data: path of the movies csv file; the columns movieId,
                            title and genres are required
        @param tags_data: path of the tags csv file; the columns movieId and
                          tag are required
        @param max_tag_features: maximum number of distinct tags that are
                                 turned into feature columns
        """
        logging.debug(
            f'[{self.offline_initialization.__name__}] - '
            f'start of function with movies_data <{movies_data}> and tags_data <{tags_data}>'
        )
        # reading the movies and tags datasets
        movie_list = pd.read_csv(movies_data, encoding="Latin1")
        tag_list = pd.read_csv(tags_data, encoding="Latin1")
        # Rows without a tag carry no information (and broke the former string
        # concatenation); tags parsed as numbers are normalised to text
        tag_list = tag_list.dropna(subset=['tag'])
        tag_list = tag_list.assign(tag=tag_list['tag'].astype(str))

        # Distinct, non-blank tags. Names of existing columns are skipped so a
        # tag can never overwrite e.g. the title column. Sorting makes the
        # selected features reproducible: iterating a set of strings yields a
        # different order (and therefore a different selection) on every run.
        reserved_columns = set(movie_list.columns) | {'tags'}
        distinct_tags = sorted({
            piece
            for tag in tag_list['tag']
            for piece in tag.split('|')
            if piece and piece not in reserved_columns
        })

        # One '|' terminated tag string per movie ('' for untagged movies);
        # replaces the row-by-row DataFrame.append, which is quadratic and no
        # longer exists in pandas 2.x
        tags_per_movie = tag_list.groupby('movieId')['tag'].agg(
            lambda tags: ''.join(f'{tag}|' for tag in tags))
        movies_with_tags = movie_list.copy()
        movies_with_tags['tags'] = movies_with_tags['movieId'].map(
            tags_per_movie).fillna('')

        # One 0/1 feature column per selected tag. The leading delimiter
        # allows matching whole tags only, so that e.g. 'war' no longer
        # matches a movie that is merely tagged 'warrior'.
        delimited_tags = '|' + movies_with_tags['tags']
        for tg in distinct_tags[:max_tag_features]:
            movies_with_tags[tg] = delimited_tags.str.contains(
                f'|{tg}|', regex=False).astype(int)

        # Feature matrix: everything except the descriptive columns
        movie_content_df = movies_with_tags.drop(
            columns=['movieId', 'title', 'genres', 'tags']).values
        logging.debug(
            f'[{self.offline_initialization.__name__}] - movie content dataframe: {movie_content_df}'
        )

        # Compute the similarity matrix (linear kernel of the tag features)
        cosine_sim = linear_kernel(movie_content_df, movie_content_df)
        # write model for offline initialization
        np.savetxt('model.txt', cosine_sim)
        # write Movie contents for runtime recommendations
        movies_with_tags.to_csv('movie_content.csv', index=True, header=True)
Exemplo n.º 8
0
def _remove_year_from_title(movie_title: str) -> str:
    """
    Strips every occurrence of YEAR_PATTERN (the release year) from the movie title
    @param movie_title: the movie title to be cleaned
    @return: the movie title without the release year
    """
    logging.debug(f'[{_remove_year_from_title.__name__}] - start of function')
    # Every match of the pattern is replaced by the empty string
    return_string: str = re.sub(YEAR_PATTERN, '', movie_title)
    logging.debug(f'[{_remove_year_from_title.__name__}] - return string: {return_string}')
    return return_string
Exemplo n.º 9
0
def _get_json_response(image_url: str) -> str:
    """
    Requests the given url and extracts the poster url from the json answer
    @param image_url: the url to be requested
    @return: the value stored under the POSTER key of the json response.
             Note: is None if the request fails, the answer is not a json
             object or the object has no POSTER key
    """
    logging.debug(f'[{_get_json_response.__name__}] - start of function')
    try:
        # Without a timeout an unresponsive service blocks the caller forever
        response = requests.get(image_url, timeout=10)
        json_response = response.json()
    except (requests.exceptions.RequestException, ValueError) as request_error:
        # Network problems and non-json answers are reported as "no poster",
        # which is the documented None result, instead of crashing the caller
        logging.error(
            f'[{_get_json_response.__name__}] - request failed with error: {request_error}'
        )
        return None
    # poster url is in the json_response
    if isinstance(json_response, dict) and POSTER in json_response:
        return json_response[POSTER]
    return None
Exemplo n.º 10
0
    def metadata_recommender(self, movie_id: int, bias=15):
        """
        Recommends movies by comparing genres, languages, actors, directors and
        keywords of every other movie with those of the given movie
        (cosine similarity of the scores)
        :param movie_id: the id of the movie
        :param bias: the scoring bias
        :return: list of movie ids, may be [], maximum length is 5
        """
        logging.debug(
            f'[{self.metadata_recommender.__name__}] - start function with movie id: {movie_id}'
        )
        if movie_id not in self.movie_metadata:
            return []

        # The metadata columns that take part in the comparison, in scoring order
        compared_columns = (GENRES_COL, LANGUAGES_COL, ACTORS_COL,
                            DIRECTORS_COL, KEYWORDS_COL)
        reference_values = [
            self.movie_metadata[movie_id][column]
            for column in compared_columns
        ]

        # Score list of the reference movie itself
        reference_scores = []
        for values in reference_values:
            Recommender.add_score_to_list(values, bias, reference_scores)

        similarity_by_movie = {}
        for candidate_id, candidate in self.movie_metadata.items():
            if candidate_id == movie_id:
                continue
            candidate_scores = []
            for column, values in zip(compared_columns, reference_values):
                Recommender.match_with_bias(candidate[column], values, bias, 1,
                                            candidate_scores)
            similarity_by_movie[candidate_id] = float(
                sm.cosine_similarity(reference_scores, candidate_scores))

        # Highest similarity first; only the best five are handed back
        ranked_ids = sorted(similarity_by_movie,
                            key=similarity_by_movie.get,
                            reverse=True)
        return ranked_ids[:5]
 def get_similar_movies_based_on_itemRating(self, input_movie_title):
     """
     Returns the 5 movies whose rating pattern correlates best with the
     given movie.
     @param input_movie_title: the reference movie title for the recommendation
     @return: dictionary form (DataFrame.to_dict) with a 'title' column of the
              similar movies; the column is empty if the title is not part of
              the rating model
     """
     logging.debug(
         f'[{self.get_similar_movies_based_on_itemRating.__name__}] - '
         f'start of function with movie title <{input_movie_title}>')
     # NOTE(review): self is handed over explicitly although the call is made
     # on self - presumably data_initialization is a staticmethod; confirm
     # against the class definition before changing this call.
     movie_title, model, _ = self.data_initialization(self)
     movie_title_list = list(movie_title)
     try:
         movie_index = movie_title_list.index(input_movie_title)
     except ValueError:
         # The title is not part of the model (e.g. a movie nobody has rated):
         # report "no recommendations" instead of failing the whole request
         logging.warning(
             f'[{self.get_similar_movies_based_on_itemRating.__name__}] - '
             f'no ratings available for movie title <{input_movie_title}>')
         return pd.DataFrame({'title': []}).to_dict()

     # Leave out the reference movie explicitly. Cutting off the first sorted
     # entry is not reliable, because another movie can tie with the score the
     # movie has with itself; the movie then showed up in its own
     # recommendations.
     sim_scores = [(index, score)
                   for index, score in enumerate(model[movie_index])
                   if index != movie_index]
     sim_scores.sort(key=lambda x: x[1], reverse=True)
     movie_indices = [i[0] for i in sim_scores[:5]]
     similar_movies = pd.DataFrame(
         {'title': [movie_title_list[index] for index in movie_indices]})
     logging.debug(
         f'[{self.get_similar_movies_based_on_itemRating.__name__}] - item-rating executed'
     )
     return similar_movies.to_dict()
Exemplo n.º 12
0
def _title_to_image_url(movie_title: str, comma_check: bool = False) -> str:
    """
    Builds the request url for the poster of the given movie title.
    The release year is always removed; the comma correction is optional.
    @param movie_title: movie title to be transformed into an image url
    @param comma_check: if True, the title is additionally passed through _comma_check
                        before the url is built
    @return: one randomly chosen entry of API_KEY_PREFIX followed by the
             prepared title, in which every BLANK is replaced by PLUS
    """
    logging.debug(f'[{_title_to_image_url.__name__}] - start of function')
    query: str = _remove_year_from_title(movie_title)
    query = _comma_check(query) if comma_check else query
    url_suffix = query.replace(BLANK, PLUS)
    url_prefix = random.choice(API_KEY_PREFIX)
    image_url: str = url_prefix + url_suffix
    logging.debug(f'[{_title_to_image_url.__name__}] - image url: {image_url}')
    return image_url
    def data_initialization(self, movie_dataset):
        """
        Builds the genre based similarity model from the movies dataset.

        Every genre found in the '|' separated 'genres' column becomes a 0/1
        indicator column; the similarity matrix is the linear kernel of these
        indicator columns.
        @param movie_dataset: path of the movies csv file; the columns movieId,
                              title and genres are required
        @return: tuple of (similarity matrix, movies dataframe enriched with
                 one indicator column per genre)
        """
        logging.debug(
            f'[{self.data_initialization.__name__}] - start of function with movie dataset: {movie_dataset}'
        )
        # reading the movies dataset
        movie_list = pd.read_csv(movie_dataset, encoding="Latin1")
        genre_column = movie_list['genres']

        # Distinct, non-blank genres. Sorted so the column order of the
        # returned dataframe is the same on every run (iterating a set of
        # strings is not); the similarity matrix does not depend on the order.
        genres = sorted({
            genre
            for entry in genre_column
            for genre in entry.split('|')
            if genre
        })

        # Enriching the movies dataset by adding the various genres columns.
        movie_content_df_temp = movie_list.copy()
        for genre in genres:
            # Same substring test as before, done once per column instead of
            # once per row
            movie_content_df_temp[genre] = genre_column.str.contains(
                genre, regex=False).astype(int)

        # Feature matrix: only the genre columns like Musical and other such columns
        movie_content_df = movie_content_df_temp.drop(
            columns=['movieId', 'title', 'genres']).values
        logging.debug(
            f'[{self.data_initialization.__name__}] - movie content dataframe: {movie_content_df}'
        )

        # Compute the similarity matrix (linear kernel of the genre features)
        cosine_sim = linear_kernel(movie_content_df, movie_content_df)

        return cosine_sim, movie_content_df_temp
Exemplo n.º 14
0
def _comma_check(movie_title: str) -> str:
    """
    Moves the part behind the last comma of a movie title to the front.
    The title is split at every COMMA, the parts are stripped of surrounding
    blanks and joined again with BLANK, the last part first.
    @param movie_title: will be checked for a comma in the string
    @return: the comma corrected movie title if a comma is found in the string,
             otherwise the unchanged movie title
    """
    logging.debug(f'[{_comma_check.__name__}] - start of function')
    if COMMA not in movie_title:
        logging.debug(f'[{_comma_check.__name__}] - comma not found in {movie_title}')
        return movie_title

    logging.debug(f'[{_comma_check.__name__}] - comma found in {movie_title}')
    # remove all blanks before and after a word
    *leading_parts, trailing_part = (part.strip()
                                     for part in movie_title.split(COMMA))
    concatenated_str: str = BLANK.join([trailing_part, *leading_parts])
    return concatenated_str
Exemplo n.º 15
0
def home(request):
    """
    Renders the start page. For a POST request the submitted search query is
    matched against the movies and every matched title is rendered together
    with its image url.
    @param request: the incoming request
    @return: the rendered "index.html" response
    """
    if request.method != 'POST':
        # No search query yet entered
        logging.debug('No search query entered at the moment')
        return render(request, "index.html", {})

    logging.debug(
        f'[{home.__name__}] - start with POST request: {request}')
    # Query saved in Textfield
    search_query = request.POST.copy().get('movieTextField')
    logging.debug(f'[{home.__name__}] - search query: {search_query}')
    # Match the search_query and return the result
    results = match_strings(search_query)

    # Every matched title is mapped to the url of its poster
    posters_by_title = {
        title: mp.get_image_url(title)
        for title in dict(results)
    }
    return render(request, "index.html", {"results": posters_by_title})
Exemplo n.º 16
0
def get_image_url(movie_title: str) -> str:
    """
    Retrieves the poster url of a movie. The lookup is first tried with the
    comma corrected title and, if that yields nothing, repeated with the
    title as given.
    @param movie_title: the movie title for which an image url will be retrieved from OMDb
    @return: the image url, where the poster of the movie can be found. Note: Can be of type None.
    """

    if not isinstance(movie_title, str):
        logging.error(f'[{get_image_url.__name__}] - movie_title in parameter is not of type str')
        return None

    logging.debug(f'[{get_image_url.__name__}] - start function with movie title: {movie_title}')

    image_url: str = _title_to_image_url(movie_title=movie_title, comma_check=True)
    json_response = _get_json_response(image_url)

    # comma corrected movie_title did receive a response
    if json_response is not None:
        logging.debug(f'[{get_image_url.__name__}] - json response received with comma correction')
        return json_response

    # try without comma correction; nothing has been received at this point,
    # so the log states the retry instead of claiming a response
    logging.debug(f'[{get_image_url.__name__}] - no json response with comma correction, retrying without it')
    image_url = _title_to_image_url(movie_title=movie_title, comma_check=False)
    return _get_json_response(image_url)
 def read_model_content_data(self):
     """
     Provides the data needed for tag based recommendations
     @return: tuple of (model_data as defined outside of this method,
              content of 'resources/movie_content.csv' as dataframe)
     """
     logging.debug(
         f'[{self.read_model_content_data.__name__}] - start of function')
     movie_content = pd.read_csv('resources/movie_content.csv',
                                 encoding="utf-8")
     return model_data, movie_content
 def read_files(self):
     """
     Reads the movies and the ratings csv files from the resources folder
     @return: tuple of (movies dataframe, ratings dataframe restricted to the
              columns userId, movieId and rating)
     """
     logging.debug(f'[{self.read_files.__name__}] - reading csv files')
     rating_columns = ['userId', 'movieId', 'rating']
     movies = pd.read_csv('resources/movies.csv', encoding="Latin1")
     ratings = pd.read_csv('resources/ratings.csv', usecols=rating_columns)
     return movies, ratings
Exemplo n.º 19
0
def recommendation(request):
    """
    Renders the recommendations of all algorithms for the movie that was
    selected via the submit button of the form.
    @param request: the incoming request; for POST requests the selected movie
                    title is expected under the key 'submit'
    @return: the rendered "recommendations.html" response, "error.html" if
             anything fails while the recommendations are gathered, or
             "index.html" for requests that are not POST
    """
    if request.method != 'POST':
        # A view has to return a response for every request; without a
        # selection there is nothing to recommend, so the start page is shown.
        # NOTE(review): "index.html" is what home() renders for non-POST
        # requests - confirm this is the desired landing page here as well.
        logging.debug(
            f'[{recommendation.__name__}] - called without POST request')
        return render(request, "index.html", {})

    logging.debug(
        f'[{recommendation.__name__}] - start function with request: {request}'
    )
    try:
        # The movie title is the value of the selected submit button in the form
        selection_query = request.POST['submit']
        # Again needs to be mapped to the actual movie object as only the string is provided
        selection = map_string_to_movie(selection_query)
        if selection.empty:
            # Fail with a clear message instead of an IndexError further down
            raise LookupError(
                f'no movie found for title <{selection_query}>')
        # ID for different algorithms to work
        selection_id = selection.iloc[0][MOVIE_ID]
        # Title to show the user as the selected movie
        selection_title = selection.iloc[0][TITLE]
        logging.debug(
            f'[{recommendation.__name__}] - selection id {selection_id}, selection title {selection_title}'
        )

        # Results of different algorithms
        rec = recommender.Recommender()
        movies_metadata: list = rec.metadata_recommender(selection_id)
        movies_keywords: list = rec.metadata_recommender_with_keywords(
            selection_id)

        rec_obj = MovieRecommendationItemRating()
        # NOTE(review): the instance is passed explicitly in addition to the
        # attribute access - presumably the method is a staticmethod; confirm
        # against the class definition before changing this call.
        movies_item_rating = rec_obj.get_similar_movies_based_on_itemRating(
            rec_obj, selection_title)

        obj_rec = MovieRecommendationByGenre()
        movies_genres = obj_rec.get_similar_movies_based_on_genre(
            selection_title)

        obj = MovieRecommendationByTags()
        movies_tags = obj.get_similar_movies_based_on_tags(selection_title)

        selection_tuple: tuple = (selection_title,
                                  mp.get_image_url(selection_title))

        alg1: dict = dict()
        alg2: dict = dict()
        alg3: dict = dict()
        alg4: dict = dict()
        alg5: dict = dict()

        movieList = [
            movies_metadata, movies_keywords, movies_item_rating,
            movies_genres, movies_tags
        ]
        alg_list = [alg1, alg2, alg3, alg4, alg5]

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=5) as executor:
            # executor.map only re-raises a worker's exception when its result
            # is retrieved; draining the iterator makes failures reach the
            # except block below instead of being dropped silently
            list(executor.map(_get_views_dict, movieList, alg_list))

        return render(
            request, "recommendations.html", {
                "selection_title": selection_tuple,
                "alg1": alg1,
                "alg2": alg2,
                "alg3": alg3,
                "alg4": alg4,
                "alg5": alg5
            })
    except Exception as excError:
        # Top level boundary of the view: log with traceback and show the
        # error page instead of an unhandled server error
        logging.exception(
            f'An error occurred during the recommendation process with error: {excError}'
        )
        return render(request, "error.html", {"error": excError})