def GetPredictions(file=None):
    """Predict a rating for every row in the global req_reviews DataFrame.

    Writes the accumulated predictions to output.csv, both periodically (as a
    checkpoint) and once at the end.

    Args:
        file: optional path to a previous output.csv; if given, its rows are
            loaded and the corresponding predictions are skipped so a stopped
            run can resume where it left off.
    """
    results = []
    # So I don't have to re-do a lot of predictions if I start and stop
    records_to_skip = 0
    if file is not None:
        results = pd.read_csv(file).to_dict('records')
        records_to_skip = len(results)
    count = 0
    total_count = req_reviews.shape[0]
    t = Timer()
    t.Start()
    for row in req_reviews.iterrows():
        count += 1
        if count > records_to_skip:
            # in the event we're continuing a file, jump to the last record
            predicted = PredictReview(row[1].reviewerID, row[1].asin)
            results.append({"datapointID": row[1].datapointID, "overall": predicted})
            if count % 1000 == 0:
                # informative prints so we know it's still working
                t.Stop()
                super_print("({} of {}) ({:.4f}s/prediction)".format(count, total_count, t.elapsed/1000))
                t.Start()
                # BUG FIX: the original wrote output.csv twice back-to-back at the
                # end; the resume feature only works if we checkpoint as we go,
                # so one of the saves belongs here inside the progress branch.
                DataFrame(results).to_csv("output.csv", index=False)
    # final save of the complete result set
    DataFrame(results).to_csv("output.csv", index=False)
def Load_MovieData():
    """Load the movie-metadata CSV into the Film table via a temp table.

    Uses the module-level `cur` cursor (and `conn` for the commit); reads from
    MOVIEMETADATA_CSV.
    """
    print("Loading Movie data from CSV...")
    t = Timer()
    t.Start()
    # Create a temporary table, since the dataset has duplicate IDs that violate
    # the Primary Key Constraint of FilmId. SQLite doesn't have an ADD CONSTRAINT
    # so we make an identical table without the constraint, fill it with data,
    # then copy that data line by line to the new table
    statement = '''
        CREATE TABLE "Film_temp" (
            'FilmID' INTEGER,
            'Title' TEXT,
            'Release' TEXT,
            'Budget' INTEGER,
            'Revenue' INTEGER,
            'Runtime' INTEGER,
            'Rating' TEXT,
            'Poster' TEXT,
            'Rating_IMDB' INTEGER,
            'Rating_RT' INTEGER,
            'Rating_MC' INTEGER,
            'BestPicture' INTEGER,
            'AA_Wins' INTEGER,
            'AA_Nominations' INTEGER
        );
    '''
    # (typo fix: the last column was declared "INTEGERS")
    cur.execute(statement)
    for f in pd.read_csv(MOVIEMETADATA_CSV, iterator=True):
        inserts = []
        for row in f.itertuples():
            # positional columns from the CSV: id, title, release, budget,
            # revenue, runtime (remaining 8 columns are filled in later)
            inserts.append([
                row[6],
                row[9],
                row[15],
                row[3],
                row[16],
                row[17],
            ])
        statement = 'INSERT INTO Film_temp VALUES (?,?,?,?,?,?,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)'
        cur.executemany(statement, inserts)
    # the CSV contains duplicate entries for 29 films - remove the extras here.
    # BUG FIX: the original deleted WHERE FilmID IN (SELECT MIN(FilmID) ...
    # GROUP BY FilmID HAVING COUNT(*) > 1), but MIN(FilmID) grouped by FilmID
    # is just FilmID, so EVERY copy of a duplicated film was deleted and those
    # films were lost entirely. Keep the first physical row of each FilmID.
    statement = '''
        DELETE FROM Film_temp
        WHERE rowid NOT IN (SELECT MIN(rowid) FROM Film_temp GROUP BY FilmID)
    '''
    cur.execute(statement)
    # copy the entirety of the temp table to the actual Film table, which has the PK constraint
    cur.execute("SELECT * FROM Film_temp")
    inserts = []
    statement = 'INSERT INTO Film VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    for row in cur:
        inserts.append(row)
    cur.executemany(statement, inserts)
    # persist, consistent with Load_Ratings (the original never committed here)
    conn.commit()
    t.Stop()
    print("Movie Data loaded in " + str(t))
def Create_NPZ(file, output_file):
    """Build a sparse user x movie ratings matrix from a JSON-lines review file.

    Saves three artifacts using the output_file prefix: <prefix>movies_df.json,
    <prefix>users_df.json (the row/column index lookups) and <prefix>.npz (the
    matrix itself, in COO format).

    Args:
        file: path to a JSON-lines file of reviews with reviewerID, asin, overall.
        output_file: path prefix for the saved artifacts.
    """
    reviews = pd.read_json(file, lines=True)
    reviews.sort_values(['reviewerID', 'asin'], ascending=[True, True], inplace=True)
    # create two DataFrames to act as indexes for the matrix
    # the index of the movie dataframe is the column number
    # the index of the users dataframe is the row number
    # example:
    #   users[users.userID=="A01174011QPNX7GZF4B92"].index.values[0] returns 7
    #   movies[movies.asin=="6300248135"].index.values[0] returns 9
    #   the value of m_reviews[7,9] is 5
    movies = DataFrame(data=reviews.asin.unique(), columns=["asin"])
    users = DataFrame(data=reviews.reviewerID.unique(), columns=["userID"])
    # PERF FIX: the original scanned the users/movies DataFrames with a boolean
    # mask for every review (O(n) per lookup -> O(n^2) total). Build plain dict
    # lookups once; same index values, O(1) per review.
    user_row = {uid: i for i, uid in enumerate(users.userID)}
    movie_col = {asin: i for i, asin in enumerate(movies.asin)}
    # initialize a new lil matrix, with size (users,movies)
    m_reviews = lil_matrix((len(users), len(movies)), dtype=np.int8)
    t = Timer()
    t.Start()
    count = 0
    total = reviews.shape[0]
    # iterate through all rows in the reviews file I've loaded in
    for row in reviews.iterrows():
        count += 1
        # grab the user and movie IDs from the dictionaries I made
        m_row_ID = user_row[row[1].reviewerID]
        m_col_ID = movie_col[row[1].asin]
        m_value = row[1].overall
        # assign the rating value to the matrix coordinate [user,movie]
        m_reviews[m_row_ID, m_col_ID] = m_value
        # just so I know it's still working
        if count % 1000 == 0:
            sys.stdout.write("{} of {} ({} remaining)...\n".format(count, total, (total-count)))
            sys.stdout.flush()
    # save the user->userid and movie->movieid dictionaries to files, because I've lost them twice already
    movies.to_json(output_file+"movies_df.json", orient='records', lines=True)
    users.to_json(output_file+"users_df.json", orient='records', lines=True)
    # convert to a coo matrix so we can save it
    m_reviews = coo_matrix(m_reviews)
    # save to file
    scipy.sparse.save_npz(output_file + ".npz", m_reviews)
    t.Stop()
    print("Completed in ", t)
def Load_Ratings():
    """Bulk-load the ratings CSV into the Ratings table, 100k rows at a time.

    Uses the module-level `cur`/`conn` database handles and RATINGS_CSV path;
    commits after each chunk so progress survives an interruption.
    """
    print("Loading Ratings data from CSV...")
    timer = Timer()
    timer.Start()
    insert_sql = 'INSERT INTO Ratings VALUES (?,?,?,?)'
    reader = pd.read_csv(RATINGS_CSV, chunksize=100000, iterator=True)
    for chunk_number, chunk in enumerate(reader, start=1):
        # drop the itertuples Index field; the remaining 4 fields map to the table
        batch = [record[1:] for record in chunk.itertuples()]
        cur.executemany(insert_sql, batch)
        conn.commit()
        sys.stdout.write("loading chunk #{}...\n".format(str(chunk_number)))
        sys.stdout.flush()
    timer.Stop()
    print("Ratings Loaded in " + str(timer))
def Load_Credits():
    """Parse the credits CSV and load cast/crew names into the database.

    Each row holds JSON-ish cast (col 1) and crew (col 2) blobs for a movie
    (id in col 3). Individual entries that fail to parse are appended to
    errors.txt and skipped, so one bad record doesn't abort the import.
    """
    print("Loading Film Credits from CSV...")
    t = Timer()
    t.Start()
    chunksize = 30
    i = 0
    sys.stdout.write("loading chunks.")
    pattern = r'{.*?}'  # pull strings out that are inside brackets
    for chunk in pd.read_csv(CREDITS_CSV, chunksize=chunksize, iterator=True):
        for row in chunk.itertuples():
            movieId = row[3]
            for cast in re.findall(pattern, row[1]):
                try:
                    cast_json = CleanJSONString(cast)
                    AddPersonToDB(movieId, cast_json['name'], "Cast")
                except Exception as e:
                    # BUG FIX: the error-log handle was named `f`, shadowing the
                    # CSV chunk loop variable; use a distinct name.
                    with open("errors.txt", 'a', encoding="utf8") as errlog:
                        errlog.write("ERR: " + str(e) + "\n")
                        errlog.write(cast + "\n\n")
            for crew in re.findall(pattern, row[2]):
                try:
                    crew_json = CleanJSONString(crew)
                    AddPersonToDB(movieId, crew_json['name'], crew_json['job'])
                except Exception as e:
                    with open("errors.txt", 'a', encoding="utf8") as errlog:
                        errlog.write("ERR: " + str(e) + "\n")
                        errlog.write(crew + "\n\n")
        i += 1
        # one dot per chunk keeps the console alive without flooding it
        sys.stdout.write(".")
        sys.stdout.flush()
    t.Stop()
    print()
    print("Credits Loaded in " + str(t))
def AAwardWinningFilms():
    """Scrape Wikipedia's list of Academy Award-winning films and update Film.

    Pulls the wikitable rows (via the local cache), extracts title, year,
    wins, nominations and Best Picture status, then writes them onto the
    matching Film rows (matched by Title and Release year).
    """
    t = Timer()
    t.Start()
    url = 'https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films'
    AA_Cache = CacheFile('WikipediaCache.json', print_info=True)
    AA_Soup = AA_Cache.CheckCache_Soup(url, strainer=SoupStrainer(class_="wikitable"))
    Films = []
    for row in AA_Soup.find_all("tr"):
        # skip the header row, identified by its "Nominations" column label
        if "Nominations" in row.text:
            continue
        cols = row.find_all("td")
        f = FilmAcademyAward()
        f.title = cols[0].text
        # some release years read "1927/28" - keep the first part only
        f.year = cols[1].text.split('/')[0]
        try:
            # Best Picture winners are the rows highlighted with this color
            f.BestPicture = ("#EEDD82" in row.attrs['style'])
        except KeyError:
            # BUG FIX: was a bare `except:` - only a missing 'style' attribute
            # (i.e. not a Best Picture winner) is expected here
            pass
        f.Awards = tryParseInt(cols[2].text.split(' ')[0])
        f.Nominations = tryParseInt(cols[3].text)
        Films.append(f)
    conn = sqlite3.connect(Database_Name)
    cur = conn.cursor()
    inserts = []
    for film in Films:
        inserts.append(film.InsertTuple())
    statement = '''
        UPDATE Film
        SET BestPicture=?,AA_Wins=?,AA_Nominations=?
        WHERE Title == ? AND Release LIKE ?
    '''
    cur.executemany(statement, inserts)
    conn.commit()
    conn.close()
    t.Stop()
    print("Scraping Completed in " + str(t))
def ResetDatabase():
    """Drop and recreate the core tables, rebinding the module-level cursor.

    Opens a fresh connection, resets the Film and Credits tables via
    ResetTable, and commits. A locked database is reported with a hint;
    other errors are printed and swallowed.
    """
    try:
        t = Timer()
        t.Start()
        # rebind the shared module-level cursor so other functions use this connection
        global cur
        # NOTE(review): this function uses `sqlite` / DATABASE_NAME while the
        # rest of the file uses `sqlite3` / Database_Name — confirm there is an
        # `import sqlite3 as sqlite` and that both constants exist, otherwise
        # this raises NameError. Also `conn` is local and never closed here.
        conn = sqlite.connect(DATABASE_NAME)
        cur = conn.cursor()
        ResetTable("Film")  # will also reset ratings when it's done
        ResetTable("Credits")
        conn.commit()
        t.Stop()
        print("Database Reset in " + str(t))
    except sqlite.OperationalError as e:
        # a lock means another client has uncommitted changes pending
        if str(e) == "database is locked":
            print(DATABASE_NAME + " has pending changes. Write those changes and restart")
        else:
            print("Database ERROR: " + str(e))
            print(type(e))
    except Exception as e:
        print("ERROR: " + str(e))
        print(type(e))
def InitializeOMDBImport():
    """Fetch OMDB metadata (MPAA rating, poster, review scores) for key films.

    Selects the most-rated, highest-rated and multi-Oscar-winning films from
    the local database, queries OMDB for each, and writes the results back to
    the Film table. Films OMDB can't resolve are skipped.
    """
    t = Timer()
    t.Start()
    print("Loading data from OMDB API...")
    conn = sqlite3.connect(Database_Name)
    cur = conn.cursor()
    # get ratings for the most popular, most highly rated films, and any film that
    # has won at least 2 academy awards
    # BUG FIX: AVG(Rating) was sorted ascending, which selected the 350 *lowest*
    # rated films - DESC matches the stated intent. (Also removed unused cur2.)
    statement = '''
        SELECT Title, Release FROM Film
        WHERE FilmID IN (
            SELECT MovieID FROM Ratings
            GROUP BY MovieID
            HAVING COUNT(*) > 10
            ORDER BY AVG(Rating) DESC
            LIMIT 350
        )
        OR FilmID IN (
            SELECT MovieID FROM Ratings
            GROUP BY MovieID
            ORDER BY COUNT(*) DESC
            LIMIT 500
        )
        OR FilmID IN (
            SELECT FilmID FROM Film WHERE AA_Wins > 1
        )
    '''
    cur.execute(statement)
    updates = []
    for row in cur:
        try:
            # Release is a date string; its first 4 chars are the year
            OMD_data = Import_OMD(row[0], row[1][:4])
            # [Rated, Poster, IMDB, RT, MC] followed by the WHERE keys
            values = [None, None, None, None, None, row[0], row[1]]
            values[0] = OMD_data['Rated']
            values[1] = OMD_data['Poster']
            for ratings in OMD_data['Ratings']:
                if ratings['Source'] == "Internet Movie Database":
                    values[2] = ratings['Value'].split('/')[0]
                if ratings['Source'] == "Rotten Tomatoes":
                    values[3] = ratings['Value']
                if ratings['Source'] == "Metacritic":
                    values[4] = ratings['Value'].split('/')[0]
            updates.append(values)
        except Exception:
            # deliberate best-effort: skip films OMDB can't resolve
            # (missing keys, API errors) rather than abort the import
            pass
    statement = 'UPDATE Film SET Rating=?, Poster=?, Rating_IMDB = ?, Rating_RT=?, Rating_MC=? WHERE Title == ? AND Release == ?'
    cur.executemany(statement, updates)
    conn.commit()
    conn.close()
    t.Stop()
    print("OMDB Import completed in " + str(t))