def execute(self, data):
    '''
    Look up a movie by name and score it with the module-level RF model

    Input:
        data (dict) - must contain the key "movie"; its value is converted
                      to a string and passed to le.get_movie
    Output:
        list of single-key dicts, in this order:
            {"Count": ...}          first value returned by le.get_movie
            {"info": ...}           title and date string joined by a space
            {"EbertReviewed": ...}  (bool) last matrix column of row 0 is > 0
            {"great": ...}          (bool) RF.predict for row 0 equals 1
            {"prob": ...}           RF.predict_proba[0, 1] for row 0
        The last three keep their defaults (False, False, 0) when no movie
        is found or the review matrix has no entries.
    '''
    rt = RT.RT()
    movie = data["movie"]

    # Defaults reported when the movie cannot be found or scored.
    Ebert = False
    pred = False
    proba = 0

    count, title, date_string, fresh_score, id_list = le.get_movie(rt, str(movie))
    if count > 0:
        # Only the reviews of the first matching movie are used.
        _total, _last_page, review_list = le.get_reviews(rt, id_list[0], fresh_score)
        entries, A = le.build_matrix(review_list, id_list, critic_list, fillzeros=True)
        if entries > 0:
            # Last column is the target, everything before it the features.
            X_test, y_test = A[:, :-1], A[:, -1]
            # Cast to builtin types: comparisons on array elements yield
            # NumPy scalars, which would make the flag types differ from the
            # plain-bool defaults above and are not accepted by the stdlib
            # json encoder.
            Ebert = bool(y_test[0] > 0)
            pred = bool(RF.predict(X_test)[0] == 1)
            proba = float(RF.predict_proba(X_test)[0, 1])

    return [
        {"Count": count},
        {"info": title + " " + date_string},
        {"EbertReviewed": Ebert},
        {"great": pred},
        {"prob": proba},
    ]
def scrape_reviews(df, collection, starting_row=0):
    '''
    Iterate over movie ids in dataframe and write reviews to a Mongo collection
    Dataframe will be updated in place with progress on the scraping

    Input/Output:
        df (Pandas Dataframe) - read from columns 'movie_id' and 'freshness';
                                written to columns 'total', 'found' and
                                'pages' for every scraped row
        collection (Mongo collection) - passed to add_to_mongo together with
                                the scraped review list
        starting_row - if restarting set to first unscraped row (int)
    '''
    rt = RT.RT()
    # Only the index labels are needed; the row contents are read from df.
    for index in df[starting_row:].index:
        if index % 50 == 0:
            # Single-argument print() emits the same text under Python 2 and 3.
            print('Scraping movie: %s' % (index,))

        m_id = df.loc[index, 'movie_id']
        # Drop surrounding whitespace and the percent sign from the score.
        m_fresh = df.loc[index, 'freshness'].strip().strip('%')

        total_stated, final_page, review_list = le.get_reviews(rt, m_id, m_fresh)
        add_to_mongo(review_list, collection)

        total_items = len(review_list)
        if total_stated < total_items - 1:
            print('For %s found %s out of %s' % (m_id, total_items, total_stated))

        # Write through .loc: chained indexing (df[col][index] = value) may
        # assign into a temporary copy and silently leave df unchanged.
        df.loc[index, 'total'] = total_stated
        df.loc[index, 'found'] = total_items
        df.loc[index, 'pages'] = final_page