def __init__(self):
    """Load the keyword and actor query results and set up an empty soup frame.

    Runs the two SELECT statements stored under ``DATA_PATH/sql`` through a
    fresh ``DbHandler``, wraps each result in a DataFrame, and prints the
    head of each frame. ``self.soup_df`` starts out with no rows.
    """
    db = DbHandler()

    # Frame columns: tconst, genres, keywords.
    keywords_sql = os.path.join(DATA_PATH, 'sql/select_genres_keywords.sql')
    keyword_rows = db.exec_select_sql_from_file(keywords_sql)
    self.keywords_df = pd.DataFrame(
        keyword_rows, columns=['tconst', 'genres', 'keywords'])
    print(self.keywords_df.head())

    # Frame columns: tconst, PrimaryName.
    actors_sql = os.path.join(DATA_PATH, 'sql/select_actors.sql')
    actor_rows = db.exec_select_sql_from_file(actors_sql)
    self.actors_df = pd.DataFrame(
        actor_rows, columns=['tconst', 'PrimaryName'])
    print(self.actors_df.head())

    # Empty placeholder with columns tconst and soup.
    self.soup_df = pd.DataFrame(columns=['tconst', 'soup'])
def test_execute_select_from_sql(self):
    """Check that the actors query runs and its rows can be read by ``tconst``.

    Any exception raised while building the path, running the query or
    reading the rows fails the test, with that exception as the assertion
    message.
    """
    db = DbHandler()
    db.connect()
    try:
        sql_path = os.path.join(DATA_PATH, "sql/select_actors.sql")
        rows = db.exec_select_sql_from_file(sql_path)
        # Read the key from every row, then show only the first two.
        tconsts = [row['tconst'] for row in rows]
        print(tconsts[:2])
    except Exception as err:
        assert False, err
class Recommender:
    """Content-based recommender built on the ``soup`` text of each title.

    The ``soup`` column is vectorised with ``CountVectorizer`` and compared
    with ``cosine_similarity``; the resulting matrix is kept in
    ``self.cosine_sim`` and can be exported to / imported from a pickle file.
    """

    def __init__(self):
        """Connect to the database and load the (tconst, soup) rows."""
        self.dbhandler = DbHandler()
        self.dbhandler.connect()
        data = self.dbhandler.exec_select_sql_from_file(
            os.path.join(DATA_PATH, "sql/select_soup.sql"))
        # Columns: tconst, soup.
        self.df = pd.DataFrame(data=data, columns=['tconst', 'soup'])
        # Set by create_cosine_sim() or import_cosine_sim_from_pkl().
        self.cosine_sim = None
        # Not used inside this class; kept because it is a public attribute.
        self.cosine_sim_csv_path = os.path.join(DATA_PATH,
                                                'cosine_sim/cosine_sim.csv')

    def create_cosine_sim(self) -> None:
        """
        Creates the cosine_sim dataframe from the self.df dataframe and
        stores it in self.cosine_sim.

        This is the method that does all the raw calculation. It only needs
        to run when the dataset changes or the exported pkl file is missing;
        otherwise it is better to import the pkl file written by an earlier
        call to export_cosine_sim_to_pkl.

        The resulting frame has a leading 'tconst' column followed by one
        integer-labelled column per row of self.df.
        :return:
        """
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(self.df['soup'])
        cosine_sim = cosine_similarity(count_matrix)

        cosine_sim_df = pd.DataFrame(data=cosine_sim)
        # Put the ids in front so that a tconst can be mapped to its row.
        cosine_sim_df.insert(0, column='tconst', value=self.df['tconst'].values)
        self.cosine_sim = cosine_sim_df

    def export_cosine_sim_to_pkl(self, pkl_path=os.path.join(
            DATA_PATH, 'cosine_sim/cosine_sim.pkl')):
        """
        Recomputes self.cosine_sim and exports it to the specified path for
        later imports. Progress and the elapsed write time are printed.
        :param pkl_path: the path of the pkl file to export the dataframe
        :return:
        """
        # Always (re)build self.cosine_sim before writing it out.
        self.create_cosine_sim()
        print(self.cosine_sim.head())
        print("Starting to write...")
        start_time = time.time()
        self.cosine_sim.to_pickle(pkl_path)
        print("Finished writing...")
        end_time = time.time()
        print(f"Elapsed {end_time - start_time} s")

    def import_cosine_sim_from_pkl(self, pkl_path=os.path.join(
            DATA_PATH, 'cosine_sim/cosine_sim.pkl'), auto_create=False):
        """
        Loads self.cosine_sim from a pkl file written by
        export_cosine_sim_to_pkl.
        :param pkl_path: the path of the pkl file to read
        :param auto_create: bool when True and the file is missing, compute
            the matrix and write the file instead of raising
        :return:
        :raises FileNotFoundError: if the file is missing and auto_create is
            False
        """
        if os.path.exists(pkl_path):
            # NOTE: unpickling can execute arbitrary code; only load files
            # this application wrote itself.
            self.cosine_sim = pd.read_pickle(pkl_path)
            return

        if not auto_create:
            # We shouldn't create the file that doesn't exist.
            print(f"Can't find file with path {pkl_path}, exiting")
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    pkl_path)

        # export_cosine_sim_to_pkl computes the matrix itself and leaves it
        # in self.cosine_sim, so there is no need to compute it beforehand
        # or to read the freshly written file back in.
        self.export_cosine_sim_to_pkl(pkl_path=pkl_path)

    def get_tconst_from_idx(self, idx: int) -> str:
        """
        Returns the tconst stored in self.df at the row whose index is idx
        :param idx: int the index label of the row in self.df
        :return: str the tconst of that row
        """
        return self.df[self.df.index == idx]["tconst"].values[0]

    def get_index_from_tconst(self, tconst) -> int:
        """
        Returns the index of the row the tconst param corresponds to in the
        self.cosine_sim dataframe
        Throws an Exception if the tconst value is not found
        :param tconst: str the tconst to look for
        :return: int the index of the first matching row
        """
        # A single scan of the column serves both as the existence check and
        # as the lookup.
        matches = self.cosine_sim.index[self.cosine_sim['tconst'] == tconst]
        if len(matches) == 0:
            raise Exception(
                f"tconst can't be found in tconst values of self.df, {tconst}")
        return matches.values[0]

    def get_recommendation_from_tconst(self, tconst: str, limit=10) -> list:
        """
        Assumes the self.cosine_sim is set (imported via
        import_cosine_sim_from_pkl or generated via create_cosine_sim)
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results to be included,
            default is 10
        :return: sorted_tconst: list of tconst id recommendations in a
            sorted order, starting from most similar
        """
        tconst_idx = self.get_index_from_tconst(tconst)

        # This selects the *column* labelled tconst_idx; it holds the same
        # values as the title's row as long as the similarity matrix is
        # symmetric.
        similarities = self.cosine_sim[tconst_idx]
        movie_recommendations = list(enumerate(similarities))

        # Sort the (position, similarity) pairs by similarity, descending.
        # NOTE(review): the queried title is not filtered out, so it is
        # normally part of the result - confirm whether callers rely on it.
        sorted_movie_recommendations = sorted(movie_recommendations,
                                              key=lambda pair: pair[1],
                                              reverse=True)

        return [
            self.get_tconst_from_idx(position)
            for position, _ in sorted_movie_recommendations[:limit]
        ]

    def get_recommendation_titles_from_tconst(self, tconst: str, limit=10):
        """
        Same as get_recommendation_from_tconst, but passes the resulting
        tconst list through get_titles_from_tconst_list
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results to be included,
            default is 10
        :return: whatever get_titles_from_tconst_list returns for the list
        """
        return get_titles_from_tconst_list(
            self.get_recommendation_from_tconst(tconst, limit))