import time
from concurrent.futures import ThreadPoolExecutor

import pandas
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Project-local modules (assumed repo layout): utils supplies the Selenium
# driver and de-duplication helpers, tool supplies MovieMetadata and CSV
# persistence, pathmng centralizes output paths. MovieScraper, parse_data,
# movie_id_path, movie_metadata_path and imdb_movie_path are likewise assumed
# to be imported from sibling modules whose names are not shown here.
import pathmng
import tool
import utils


def get_imdb_list_page_link():
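    """Walk IMDb's paginated search results and persist each "next page" URL.

    PageLink wraps a single URL in a tool.MovieMetadata subclass so the links
    can be saved with the same tool.save_metadata_to_csv helper used for
    movies. Progress is flushed every 50 pages, so the crawl can resume from
    pathmng.imdb_next_link_path after an interruption.
    """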
    class PageLink(tool.MovieMetadata):
        def __init__(self, link):
            self.link = link

    driver = utils.get_driver()

    pre_link = "https://www.imdb.com/search/title/?release_date=2010-01-01,2020-12-31&runtime=60,300&start=9951&ref_=adv_nxt"
    pre_link = "https://www.imdb.com/search/title/?release_date=2000-01-01,2009-12-31&runtime=60,300"

    # Resume from the last saved "next page" link if a previous run left one.
    try:
        pre_df = pandas.read_csv(pathmng.imdb_next_link_path)
        pre_link = list(pre_df["link"])[-1]
    except (FileNotFoundError, pandas.errors.EmptyDataError, KeyError, IndexError):
        pass  # no saved progress; start from the default link above
    link_list = []

    driver.get(pre_link)
    for i in range(1, 1000):
        try:
            next_button = driver.find_element(
                By.XPATH, '//*[@class="lister-page-next next-page"]')
        except NoSuchElementException:
            break  # no "next" button left: last results page reached
        link_list.append(PageLink(next_button.get_attribute("href")))
        next_button.click()
        time.sleep(2)  # throttle requests and let the next page load
        # Flush progress every 50 pages so an interruption loses little work.
        if i % 50 == 0:
            tool.save_metadata_to_csv(
                utils.filter_duplicate_preserve_order(link_list),
                pathmng.imdb_next_link_path)
            link_list.clear()
    # Save any links collected since the last flush.
    if link_list:
        tool.save_metadata_to_csv(
            utils.filter_duplicate_preserve_order(link_list),
            pathmng.imdb_next_link_path)


def get_Academy_Award_for_Best_Actor_Director():
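    """Scrape the Wikipedia Best Actor and Best Director award pages and save
    the combined winner metadata to pathmng.wiki_best_actor_director_path."""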
    driver = utils.get_driver()
    urls = ["https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor",
            "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director"]

    data = []
    for url in urls:
        data.extend(get_academy_award_for_best_people(driver, url))

    tool.save_metadata_to_csv(data, pathmng.wiki_best_actor_director_path)


def get_movie_data():
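    """Crawl Rotten Tomatoes metadata for up to 10,000 movie ids concurrently.

    Each worker fetches one movie page; failures are logged and skipped so a
    single bad page cannot stop the pool. Appending to movie_metadatas from
    multiple threads is safe because list.append is atomic in CPython.
    """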
    movie_metadatas = []

    def crawl_movie_and_append(movie_id):
        try:
            print("Getting for", movie_id, "at", len(movie_metadatas))
            metadata = MovieScraper(
                movie_url=f"https://www.rottentomatoes.com/m/{movie_id}"
            ).extract_metadata().metadata
            movie_metadatas.append(metadata)
        except Exception as e:
            print(e)

    with open(movie_id_path, "r") as f:
        all_movie_id = f.read().splitlines()[:10000]
        print(len(all_movie_id), "movie ids loaded")
        # Submit one task per movie; the pool bounds concurrency at 20 workers.
        with ThreadPoolExecutor(max_workers=20) as executor:
            for movie_id in all_movie_id:
                executor.submit(crawl_movie_and_append, movie_id)
        # Sequential fallback, handy when debugging:
        # for movie_id in all_movie_id:
        #     crawl_movie_and_append(movie_id)

    tool.save_metadata_to_csv(movie_metadatas, movie_metadata_path)


if __name__ == '__main__':
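    # Crawl IMDb advanced-search results for 1980-1998, one year at a time;
    # the "start" query parameter pages through the results 50 titles at a time.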
    movie_metadata_list = []

    for year in range(1980, 1999):
        for i in range(1, 2000, 50):
            string_url = (f"https://www.imdb.com/search/title/"
                          f"?release_date={year}-01-01,{year}-12-31"
                          f"&runtime=60,300&start={i}")
            res = parse_data(string_url)
            movie_metadata_list.extend(res)
            print("crawled...", i + 49, "movies")
        # Flush one CSV chunk per year, then reset the buffer.
        tool.save_metadata_to_csv(movie_metadata_list, imdb_movie_path)
        movie_metadata_list.clear()

    # Resume crawling from title 10001 using the saved "next page" links:
    # for index, link in enumerate(list(pandas.read_csv(pathmng.imdb_next_link_path)["link"])):
    #     print("crawled...", (index + 1) * 50, "movies")
    #     res = parse_data(link)
    #     movie_metadata_list.extend(res)
    #
    #     if index % 50 == 49:
    #         tool.save_metadata_to_csv(movie_metadata_list, imdb_movie_path)
    #         movie_metadata_list.clear()