def fix_oscar_dates():
    """Correct ``MovieAward.award_date`` values from Wikipedia's ceremony list.

    Scrapes the *second* wikitable on the "List of Academy Awards ceremonies"
    page, parses the ceremony date of each row, and rewrites every stored
    award date to the Wikipedia date for the matching year.  Rows whose year
    has no Wikipedia match keep their existing date.
    """
    page = rq.get('https://en.wikipedia.org/wiki/List_of_Academy_Awards_ceremonies')
    soup = BeautifulSoup(page.text, 'lxml')
    # The ceremony list is the second wikitable on the page; skip the first.
    table = soup.find('table', {'class': 'wikitable'}).find('tbody')
    table = table.findNext('table', {'class': 'wikitable'}).find('tbody')

    data = []
    for row in table.find_all('tr'):
        cols = [cell.text.strip() for cell in row.find_all('td')]
        data.append([value for value in cols if value])  # drop empty cells

    # Column 1 holds the ceremony date string; data[0] is the header row.
    date_strings = [row[1] for row in data[1:] if len(row) >= 2]
    parsed = (dateparser.parse(text) for text in date_strings)
    # dateparser.parse returns None for unparseable strings — skip those
    # instead of crashing on `.year`.
    parsed_dates_dict = {d.year: d.date() for d in parsed if d is not None}

    with session_scope() as session:
        for award in session.query(MovieAward).all():
            award.award_date = parsed_dates_dict.get(
                award.award_date.year, award.award_date)
    # NOTE(review): the original ended with a second session block building a
    # `set` of all award dates that was never used — removed as dead code.
def scrape_imdb_awards():
    """Run the IMDB award scraper for every award over its full year range.

    A failure for a single (award, year) pair is printed (with traceback)
    and does not stop the remaining scrapes.
    """
    with session_scope() as session:
        for award in session.query(Award).all():
            # Some awards have a dedicated scraper class; default otherwise.
            scraper_cls = SCRAPER_CLS_DICT.get(award.award_id, IMDBAwardScraper)
            for year in range(award.start_year, award.end_year + 1):
                try:
                    print(f"Scraping {award.award_name} {year}")
                    scraper = scraper_cls(
                        award.award_id,
                        year,
                        award.award_name,
                        award.date_timedelta,
                    )
                    scraper.scrape()
                except Exception as e:
                    print(f"ERROR, problem with {award.award_name} {year}, Exception: {e}")
                    traceback.print_exc()
def scrape_oscar_movies():
    """Scrape the Rotten Tomatoes page of every Best-Film-nominated movie.

    Selects distinct (imdb_id, rt_url) pairs for movies with an
    OSCARS_BEST_FILM award entry; movies without a known RT url are skipped.
    """
    with session_scope() as session:
        best_film_movies = (
            session.query(Movie)
            .join(MovieAward, Movie.movie_wiki_url == MovieAward.movie_wiki_url)
            .filter(MovieAward.award_category == OSCARS_BEST_FILM)
            .distinct()
            .values(Movie.imdb_id, Movie.rt_url)
        )
        for imdb_id, rt_url in list(best_film_movies):
            if rt_url is None:
                continue  # no RT page known for this movie
            print(f"Start scraping {rt_url}, id: {imdb_id}")
            RTMovie(imdb_id, rt_url).scrape()
            print(f"Scraped {rt_url}")
def scrape(self, update_after=None):
    """Scrape this movie's Rotten Tomatoes page: scores plus critic reviews.

    A completed scrape is recorded in ScrapingLog under a per-movie id;
    later calls are skipped unless *update_after* is given and the logged
    scrape predates it.
    """
    scrape_id = f"rt_{self.imdb_id}_critic_reviews"
    last_scraped = ScrapingLog.get_date(scrape_id)
    # Guard: already scraped and not stale relative to update_after.
    if last_scraped and not (update_after and last_scraped < update_after):
        print(f"Skip scraping {scrape_id}: Already scraped")
        return

    page = rq.get(self.url)
    soup = BeautifulSoup(page.text, 'lxml')
    with session_scope() as session:
        movie = session.query(Movie).get(self.imdb_id)
        movie.rt_tomato_score = self.criticScore(soup)
        movie.rt_audience_score = self.audienceScore(soup)
        session.bulk_save_objects([movie])

    self.save_reviews(self.criticReviews())
    ScrapingLog.add_log(scrape_id)
def save_results(self, results):
    """Bulk-persist award results as MovieAward rows.

    *results* is an iterable of
    (award_category, movie_imdb_id, person_imdb_id, person_name, winner)
    tuples; award id, name and date come from this scraper instance.
    """
    with session_scope() as session:
        rows = []
        for category, movie_id, person_id, person_name, winner in results:
            rows.append(
                MovieAward(
                    award_id=self.award_id,
                    award_category=category,
                    movie_imdb_id=movie_id,
                    award_name=self.award_name,
                    person_imdb_id=person_id,
                    person_name=person_name,
                    winner=winner,
                    award_date=self.award_date,
                )
            )
        session.bulk_save_objects(rows)
def save_reviews(self, reviews):
    """Bulk-persist Rotten Tomatoes reviews as RTReview rows.

    Duplicates are removed first.  If the bulk save fails, every review is
    dumped to stdout (sorted for reproducible output) so the offending row
    can be identified, then the exception is re-raised.
    """
    reviews = self.removeDuplicatedReviews(reviews)
    try:
        with session_scope() as session:
            session.bulk_save_objects([
                RTReview(
                    type=r.reviewer_type,
                    movie_imdb_id=self.imdb_id,
                    reviewer_url=r.reviewer_url,
                    reviewer_name=r.reviewer_name,
                    fresh=r.fresh,
                    original_score=r.original_score,
                    review_text=r.text,
                    review_date=r.date,
                )
                for r in reviews
            ])
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not intercepted by the debug dump below.
        print("Problem saving")
        for r in sorted(reviews, key=lambda x: (x.reviewer_type, x.reviewer_url, x.date)):
            print(
                f"{r.reviewer_url}, {r.date}, {r.reviewer_name}, {r.fresh}, {r.original_score} {r.text}"
            )
        raise