import time

import pandas

import pathmng
import tool
import utils


def get_imdb_list_page_link():
    # Walk IMDb search results by following the "next page" button and record
    # every page URL, so a later crawl can resume from the last checkpoint.
    class PageLink(tool.MovieMetadata):
        def __init__(self, link):
            self.link = link

    driver = utils.get_driver()
    # Default starting page. An earlier run started from:
    # "https://www.imdb.com/search/title/?release_date=2010-01-01,2020-12-31&runtime=60,300&start=9951&ref_=adv_nxt"
    pre_link = "https://www.imdb.com/search/title/?release_date=2000-01-01,2009-12-31&runtime=60,300"
    try:
        # Resume from the last checkpointed link when one exists.
        pre_df = pandas.read_csv(pathmng.imdb_next_link_path)
        pre_link = list(pre_df["link"])[-1]
    except (FileNotFoundError, KeyError, IndexError):
        pass  # no checkpoint file yet: start from the default page

    next_button_xpath = '//*[@class="lister-page-next next-page"]'
    link_list = []
    driver.get(pre_link)
    for i in range(1, 1000):
        # Read the href of the "next" button, store it, then click through.
        link = driver.find_element_by_xpath(next_button_xpath).get_attribute("href")
        link_list.append(PageLink(link))
        driver.find_element_by_xpath(next_button_xpath).click()
        time.sleep(2)
        # Checkpoint every 50 pages so progress survives a crash.
        if i % 50 == 0:
            tool.save_metadata_to_csv(
                utils.filter_duplicate_preserve_order(link_list),
                pathmng.imdb_next_link_path)
            link_list.clear()
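
# The checkpointing above relies on utils.filter_duplicate_preserve_order to
# drop repeated page links before writing. The real helper lives in this
# project's utils module; the version below is only a sketch of the likely
# behavior, assuming PageLink objects are deduplicated by their .link field.
def filter_duplicate_preserve_order_sketch(items):
    seen = set()
    unique = []
    for item in items:
        key = getattr(item, "link", item)
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique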

def get_Academy_Award_for_Best_Actor_Director():
    # Collect Academy Award winners for Best Actor and Best Director from
    # Wikipedia and save both lists to a single CSV.
    driver = utils.get_driver()
    urls = ["https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor",
            "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director"]
    data = []
    for url in urls:
        data.extend(get_academy_award_for_best_people(driver, url))
    tool.save_metadata_to_csv(data, pathmng.wiki_best_actor_director_path)
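
# get_academy_award_for_best_people is defined elsewhere in this project. As a
# rough sketch of its likely shape, assuming the winners appear in standard
# "wikitable" tables on each page (the selectors here are assumptions, and the
# legacy Selenium API matches the rest of this module):
def get_academy_award_for_best_people_sketch(driver, url):
    driver.get(url)
    rows = []
    for table in driver.find_elements_by_class_name("wikitable"):
        for row in table.find_elements_by_tag_name("tr"):
            cells = [cell.text for cell in row.find_elements_by_tag_name("td")]
            if cells:
                rows.append(cells)
    return rows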

from concurrent.futures import ThreadPoolExecutor

# MovieScraper, movie_id_path, and movie_metadata_path come from other modules
# in this project.


def get_movie_data():
    movie_metadatas = []

    def crawl_movie_and_append(movie_id):
        # Scrape one Rotten Tomatoes page; appending to the shared list is
        # safe here because list.append is atomic under CPython's GIL.
        try:
            print("Getting for", movie_id, "at", len(movie_metadatas))
            metadata = MovieScraper(
                movie_url=f"https://www.rottentomatoes.com/m/{movie_id}"
            ).extract_metadata().metadata
            movie_metadatas.append(metadata)
        except Exception as e:
            print(e)

    with open(movie_id_path, "r") as f:
        all_movie_id = f.read().split("\n")[0:10000]
    print(len(all_movie_id))

    # Crawl up to 20 pages concurrently; the context manager waits for every
    # submitted task to finish before the CSV is written.
    with ThreadPoolExecutor(max_workers=20) as e:
        for movie_id in all_movie_id:
            e.submit(crawl_movie_and_append, movie_id)

    # Sequential fallback, kept for debugging:
    # for movie_id in all_movie_id:
    #     crawl_movie_and_append(movie_id)

    tool.save_metadata_to_csv(movie_metadatas, movie_metadata_path)
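
# An equivalent formulation of get_movie_data using executor.map: results come
# back in submission order and no mutable list is shared across threads. Same
# assumed MovieScraper and path names as above; a sketch, not the project's code.
def get_movie_data_with_map():
    def crawl(movie_id):
        try:
            return MovieScraper(
                movie_url=f"https://www.rottentomatoes.com/m/{movie_id}"
            ).extract_metadata().metadata
        except Exception as e:
            print(e)
            return None

    with open(movie_id_path, "r") as f:
        all_movie_id = f.read().split("\n")[0:10000]

    with ThreadPoolExecutor(max_workers=20) as executor:
        metadatas = [m for m in executor.map(crawl, all_movie_id) if m is not None]
    tool.save_metadata_to_csv(metadatas, movie_metadata_path)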

if __name__ == '__main__':
    # Crawl IMDb search results year by year, 50 titles per page, and
    # checkpoint the accumulated metadata to CSV after each year.
    movie_metadata_list = []
    for year in range(1980, 1999):
        for i in range(1, 2000, 50):
            string_url = (
                f"https://www.imdb.com/search/title/"
                f"?release_date={year}-01-01,{year}-12-31&runtime=60,300&start={i}"
            )
            print("crawled...", i + 50, "movies")
            res = parse_data(string_url)
            movie_metadata_list.extend(res)
        tool.save_metadata_to_csv(movie_metadata_list, imdb_movie_path)
        movie_metadata_list.clear()

    # Crawl starting from title 10001, resuming from the saved page links:
    # for index, link in enumerate(list(pandas.read_csv(pathmng.imdb_next_link_path)["link"])):
    #     print("crawled...", (index + 1) * 50, "movies")
    #     res = parse_data(link)
    #     movie_metadata_list.extend(res)
    #
    #     if index % 50 == 49:
    #         tool.save_metadata_to_csv(movie_metadata_list, imdb_movie_path)
    #         movie_metadata_list.clear()
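
# parse_data is imported from another module in this repo. Below is a hedged
# sketch of what it presumably does with one search-results URL, using the
# same legacy Selenium API as above; the "lister-item-content" selector is an
# assumption based on IMDb's old lister layout.
def parse_data_sketch(url):
    driver = utils.get_driver()
    driver.get(url)
    titles = []
    for item in driver.find_elements_by_class_name("lister-item-content"):
        # Each result block carries the title link as its first anchor.
        titles.append(item.find_element_by_tag_name("a").text)
    return titles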