def test_get_titles_from_tconst(self):
    """
    Exercise get_titles_from_tconst_list on the recommendations of a random movie

    The recommended tconst ids are resolved to titles, and the number of titles
    is compared with the number of rows of the title_basics / title_keywords
    join whose tconst is among those recommendations
    :return:
    """
    recommender = Recommender()
    recommender.import_cosine_sim_from_pkl()

    # pick a random movie out of the ones the similarity matrix knows about
    known_tconsts = list(recommender.cosine_sim.tconst.values)
    picked_tconst = random.choice(known_tconsts)

    recommended_tconsts = recommender.get_recommendation_from_tconst(
        picked_tconst)
    titled_pairs = get_titles_from_tconst_list(recommended_tconsts)
    recommendation_titles = [pair[1] for pair in titled_pairs]

    db = DbHandler()
    db.connect()
    # The join restricts the rows to the titles that are of interest here
    query = text(
        "SELECT tconst,primaryTitle from title_basics NATURAL JOIN title_keywords"
    )
    rows = db.conn.execute(query)
    titles_frame = pd.DataFrame(data=list(rows),
                                columns=['tconst', 'primaryTitle'])

    # Rows of the db frame whose tconst was returned by the recommender
    is_recommended = titles_frame['tconst'].isin(recommended_tconsts)
    matched_titles = titles_frame.loc[is_recommended]['primaryTitle']

    # Every recommended title must have been found in the db
    assert len(recommendation_titles) == matched_titles.size
def insert_to_db(self):
    """
    Append the rows of self.filtered_df to the title_basics table
    :return:
    """
    db = DbHandler()
    db.connect()
    self.filtered_df.to_sql(name="title_basics",
                            con=db.conn,
                            if_exists='append',
                            index=False)
def test_execute_select_from_sql(self):
    """
    Run sql/select_actors.sql through DbHandler.exec_select_sql_from_file

    The first two tconst values of the result are printed; any exception raised
    along the way fails the test.
    :return:
    """
    handler = DbHandler()
    handler.connect()
    try:
        results = handler.exec_select_sql_from_file(
            os.path.join(DATA_PATH, "sql/select_actors.sql"))
        print([r['tconst'] for r in results][:2])
    except Exception as e:
        # An explicit raise is used instead of `assert False, e`: assert
        # statements are removed under `python -O`, which would silently
        # swallow the error and let the test pass.
        raise AssertionError(e) from e
def filter_foreign_keys(self):
    """
    Keep only the rows of self.filtered_df whose nconst is present in the
    title_principals table
    :return:
    """
    myDbHandler = DbHandler()
    myDbHandler.connect()
    # The statement is built from adjacent literals; a backslash continuation
    # inside the string would leak source layout into the SQL text.
    query = text("SELECT DISTINCT nconst "
                 "FROM title_principals")
    nconst_ids = [row["nconst"] for row in myDbHandler.conn.execute(query)]
    self.filtered_df = self.filtered_df[self.filtered_df.nconst.isin(
        nconst_ids)]
    return
def insert_to_db(self):
    """
    Append the contents of self.filtered_df to the title_ratings table
    :return:
    """
    db = DbHandler()
    db.connect()
    self.filtered_df.to_sql(name="title_ratings",
                            con=db.conn,
                            if_exists='append',
                            index=False)
def test_execute_query(self):
    """
    Run a sample query against information_schema to make sure the connection
    works; any exception raised by the query fails the test
    :return:
    """
    handler = DbHandler()
    handler.connect()
    try:
        # Adjacent literals keep the SQL free of the whitespace a backslash
        # continuation inside the string would embed.
        handler.conn.execute(
            text("SELECT table_name FROM information_schema.tables "
                 "WHERE table_schema = 'movie_recommender';"))
    except Exception as e:
        # `assert False, e` disappears under `python -O`, which would hide the
        # failure; raise the AssertionError explicitly instead.
        raise AssertionError(e) from e
def filter_foreign_keys(self):
    """
    Restrict self.filtered_df to the tconst ids found in title_basics, then
    reduce it to the tconst and nconst columns with one row per grouped pair
    :return:
    """
    db = DbHandler()
    db.connect()
    result = db.conn.execute(text("SELECT tconst FROM title_basics"))
    tconst_ids = [row["tconst"] for row in result]

    has_known_tconst = self.filtered_df.tconst.isin(tconst_ids)
    self.filtered_df = self.filtered_df[has_known_tconst]

    # Group on the pair; the size column produced here is dropped right after
    grouped_pairs = self.filtered_df.groupby(['tconst', 'nconst'])
    self.filtered_df = grouped_pairs.size().reset_index()
    self.filtered_df = self.filtered_df[['tconst', 'nconst']]
def insert_to_db(self) -> None:
    """
    Append self.soup_df to the title_soup table

    self.soup_df has to exist before this is called (the original note says it
    is created by the group_actors function)
    :return:
    """
    db = DbHandler()
    db.connect()
    self.soup_df.to_sql(name="title_soup",
                        con=db.conn,
                        if_exists='append',
                        index=False)
    print("Successfully inserted values in the db")
def filter_foreign_keys(self):
    """
    Drop every row of self.filtered_df whose tconst is not in title_basics
    :return:
    """
    db = DbHandler()
    db.connect()
    result = db.conn.execute(text("SELECT tconst FROM title_basics"))
    tconst_ids = [row["tconst"] for row in result]

    has_known_tconst = self.filtered_df.tconst.isin(tconst_ids)
    self.filtered_df = self.filtered_df[has_known_tconst]
def get_titles_from_tconst_list(tconst_list: list) -> list:
    """
    Resolve tconst ids to their primaryTitle

    The titles are read from the natural join of title_basics and title_soup.
    :param tconst_list: list of tconst ids, normally generated via
    get_recommendation_from_tconst
    :return: list of (tconst, primaryTitle) tuples, in the order of tconst_list
    """
    db = DbHandler()
    db.connect()
    query = sql_text(
        "SELECT tconst, primaryTitle FROM title_basics NATURAL JOIN title_soup")
    rows = list(db.conn.execute(query))
    titles_frame = pd.DataFrame(data=rows, columns=['tconst', 'primaryTitle'])

    titled_pairs = []
    for tconst in tconst_list:
        matching_rows = titles_frame.loc[titles_frame['tconst'] == tconst]
        # first matching title; indexing fails if the tconst is not in the join
        title = matching_rows['primaryTitle'].values[0]
        titled_pairs.append((tconst, title))
    return titled_pairs
class Recommender:
    """
    Recommends titles by cosine similarity of their "soup" strings

    On construction the (tconst, soup) rows are loaded from the db into
    self.df. The similarity matrix itself (self.cosine_sim) is either computed
    with create_cosine_sim or loaded with import_cosine_sim_from_pkl.
    """

    def __init__(self):
        self.dbhandler = DbHandler()
        self.dbhandler.connect()
        data = self.dbhandler.exec_select_sql_from_file(
            os.path.join(DATA_PATH, "sql/select_soup.sql"))
        # one row per title: its tconst id and the text the vectorizer is fed
        self.df = pd.DataFrame(data=data, columns=['tconst', 'soup'])
        # DataFrame with a leading 'tconst' column followed by the similarity
        # columns; stays None until created or imported
        self.cosine_sim = None
        self.cosine_sim_csv_path = os.path.join(DATA_PATH,
                                                'cosine_sim/cosine_sim.csv')
        return

    def create_cosine_sim(self) -> None:
        """
        Creates the cosine_sim matrix from the self.df dataframe

        This is the method that does all the raw calculation, so it only needs
        to run when the dataset changes or the exported pkl file is missing.
        Otherwise it is better to load the file written by an earlier
        export_cosine_sim_to_pkl call via import_cosine_sim_from_pkl
        :return:
        """
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(self.df['soup'])
        cosine_sim = cosine_similarity(count_matrix)
        tconst_array = self.df['tconst'].values
        cosine_sim_df = pd.DataFrame(data=cosine_sim)
        # keep the tconst of every row next to its similarity values
        cosine_sim_df.insert(0, column='tconst', value=tconst_array)
        self.cosine_sim = cosine_sim_df
        return

    def export_cosine_sim_to_pkl(self,
                                 pkl_path=os.path.join(
                                     DATA_PATH, 'cosine_sim/cosine_sim.pkl')):
        """
        Computes self.cosine_sim and pickles it to the specified path for
        later imports
        :param pkl_path: the path of the pkl file to export the dataframe
        :return:
        """
        # (Re)build the self.cosine_sim df before writing it out
        self.create_cosine_sim()
        print(self.cosine_sim.head())
        print("Starting to write...")
        start_time = time.time()
        self.cosine_sim.to_pickle(pkl_path)
        print("Finished writing...")
        end_time = time.time()
        print(f"Elapsed {end_time - start_time} s")
        return

    def import_cosine_sim_from_pkl(self,
                                   pkl_path=os.path.join(
                                       DATA_PATH, 'cosine_sim/cosine_sim.pkl'),
                                   auto_create=False):
        """
        Loads self.cosine_sim from a pkl file
        :param pkl_path: the path of the pkl file to read
        :param auto_create: when True and the file is missing, compute the
        matrix and export it to pkl_path first instead of raising
        :return:
        :raises FileNotFoundError: the file is missing and auto_create is False
        """
        if not os.path.exists(pkl_path):
            if not auto_create:
                # Throw exception, we shouldn't create the file that doesn't exist
                print(f"Can't find file with path {pkl_path}, exiting")
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT), pkl_path)
            # export_cosine_sim_to_pkl already calls create_cosine_sim, so the
            # matrix must not be computed here as well (it used to be built
            # twice)
            self.export_cosine_sim_to_pkl(pkl_path=pkl_path)
        # NOTE: unpickling can execute arbitrary code; only load files written
        # by export_cosine_sim_to_pkl
        self.cosine_sim = pd.read_pickle(pkl_path)
        return

    def get_tconst_from_idx(self, idx: int) -> str:
        """
        Returns the tconst stored in the self.df row whose index equals idx
        :param idx: int index label in self.df
        :return: str the tconst of that row
        """
        return self.df[self.df.index == idx]["tconst"].values[0]

    def get_index_from_tconst(self, tconst) -> int:
        """
        Returns the index of the row the tconst param corresponds to in the
        self.cosine_sim dataframe
        Throws an Exception if the tconst value is not found
        :param tconst: str the tconst to look for
        :return: int the index of the row found
        """
        if tconst not in self.cosine_sim['tconst'].values:
            raise Exception(
                f"tconst can't be found in tconst values of self.cosine_sim, {tconst}"
            )
        return self.cosine_sim[self.cosine_sim['tconst'] ==
                               tconst].index.values[0]

    def get_recommendation_from_tconst(self, tconst: str, limit=10) -> list:
        """
        Assumes the self.cosine_sim is set (imported via
        import_cosine_sim_from_pkl or generated via create_cosine_sim)
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results to be included,
        default is 10
        :return: sorted_tconst: list of tconst id recommendations in a sorted
        order, starting from most similar
        """
        tconst_idx = self.get_index_from_tconst(
            tconst)  # get the index of the movie
        # The column labelled tconst_idx holds the similarity of every title
        # to the requested one; enumerate pairs each value with its position
        movie_recommendations = enumerate(self.cosine_sim[tconst_idx])
        # sort the values by the similarity in desc order
        sorted_movie_recommendations = sorted(movie_recommendations,
                                              key=lambda x: x[1],
                                              reverse=True)
        # NOTE(review): positions are translated through self.df, which
        # presumably has the same row order as self.cosine_sim - confirm when
        # the pkl file may be older than the db contents
        sorted_tconst = [
            self.get_tconst_from_idx(movie[0])
            for movie in sorted_movie_recommendations[:limit]
        ]
        return sorted_tconst

    def get_recommendation_titles_from_tconst(self, tconst: str, limit=10):
        """
        Same as get_recommendation_from_tconst, but the ids are passed through
        get_titles_from_tconst_list
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results, default is 10
        :return: whatever get_titles_from_tconst_list returns for the
        recommended ids
        """
        return get_titles_from_tconst_list(
            self.get_recommendation_from_tconst(tconst, limit))
def test_handler_connection(self):
    """
    Make sure DbHandler.connect() leaves a non-None conn attribute behind
    :return:
    """
    db = DbHandler()
    db.connect()
    assert db.conn is not None