Exemplo n.º 1
0
 def __init__(self, project_path='/root/project/'):
     """Obtain the Chrome driver and prepare an empty ``banner.json``.

     Args:
         project_path: Output directory, created if it does not exist.
             File names are appended to it by plain string concatenation,
             so it should end with a path separator for the file to land
             inside the directory.
     """
     # Initialise Chrome
     self.driver, self.server, self.proxy = ChromeDriver().get_driver()
     self.project_path = project_path
     # exist_ok avoids the race between a separate exists() check and the
     # directory creation.
     os.makedirs(project_path, exist_ok=True)
     # 'w+' creates the file or truncates an existing one; nothing is
     # written here.
     with open(project_path + 'banner.json', 'w+'):
         pass
 def __init__(self, project_path='/root/project/'):
     """Obtain the Chrome driver and reset the output files and image folder.

     Creates *project_path* if needed, creates or truncates
     ``movie_detail.json`` in it, and replaces any existing ``images/``
     sub-directory with an empty one.

     Args:
         project_path: Output directory. File names are appended to it by
             plain string concatenation, so it should end with a path
             separator for the files to land inside the directory.
     """
     # Initialise Chrome
     self.driver, self.server, self.proxy = ChromeDriver().get_driver()
     self.project_path = project_path
     # exist_ok avoids the race between a separate exists() check and the
     # directory creation.
     os.makedirs(project_path, exist_ok=True)
     # 'w+' creates the file or truncates an existing one; nothing is
     # written here.
     with open(project_path + 'movie_detail.json', 'w+'):
         pass
     file_dir = project_path + 'images/'
     # Start from an empty image directory: drop whatever is left from a
     # previous run, then create it afresh.
     if os.path.exists(file_dir):
         shutil.rmtree(file_dir)
     os.makedirs(file_dir)
Exemplo n.º 3
0
 def start_crawl(self):
     """Crawl the list pages, restarting the browser whenever a page fails.

     Page numbers from 1 up to (but not including) the value returned by
     ``self.get_limitation()`` are passed to ``self.get_movie_list``. A
     falsy result shuts the browser and server down, waits, obtains a new
     driver and retries the same page; there is no retry limit. After the
     last page the browser and server are stopped and
     ``self.rewrite_result()`` is called.
     """
     page_limit = self.get_limitation()
     for page in range(1, page_limit):
         # Retry the same page on a fresh browser until it reports success.
         while not self.get_movie_list(page):
             print("quit chrome")
             self.driver.quit()
             self.server.stop()
             time.sleep(5)
             print("reopen chrome ")
             self.driver, self.server, self.proxy = ChromeDriver().get_driver()

     # Final teardown before the results are rewritten.
     self.driver.quit()
     self.server.stop()
     time.sleep(5)
     self.rewrite_result()
Exemplo n.º 4
0
async def fetch_info(session, url):
    """Open *url* in Chrome, locate the player's video source and download it.

    Args:
        session: Passed through unchanged to ``download_file``.
        url: Page address handed to ``ChromeDriver.process``.

    Returns:
        dict: The values of ``video_duration``, ``video_title``,
        ``image_url``, ``link_url`` and ``quality_480p`` read from the
        page's ``flashvars`` object (``None`` for any key that is absent).

    Raises:
        IndexError: If the page source contains no ``var flashvars =`` line.
    """
    driver = ChromeDriver()
    # NOTE(review): nothing in this function shuts the browser down again.
    # Confirm that ChromeDriver manages its own lifetime; otherwise every
    # call leaves a browser behind.
    await driver.process(url)

    video_xpath = '//*[@id="player"]/div[21]/video/source'
    # Block until the <source> element exists before reading it.
    driver.wait.until(EC.presence_of_element_located((By.XPATH, video_xpath)))
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # deprecated in Selenium 4 and removed in 4.3.
    source_element = driver.driver.find_element(By.XPATH, video_xpath)
    video_url = source_element.get_attribute('src')

    match = re.search('var flashvars =(.*?),\n', driver.driver.page_source)
    if match is None:
        # Same exception type the former ``findall(...)[0]`` produced, but
        # with a message that says what was missing.
        raise IndexError('flashvars not found in page source of %s' % url)
    flashvars = json.loads(match.group(1))

    # These values used to be read into locals and then discarded; return
    # them so the parsed metadata is available to the caller.
    metadata = {
        key: flashvars.get(key)
        for key in (
            'video_duration',
            'video_title',
            'image_url',
            'link_url',
            'quality_480p',
        )
    }

    # The URL path starts with '/', so the target mirrors the remote path
    # below ./tmp/.
    file_path = urlparse(video_url).path
    await download_file(session, video_url, "./tmp/" + file_path)
    return metadata
 def start_crawl(self):
     """Fetch the detail of every URL, restarting the browser on failure.

     Each entry returned by ``self.load_file()`` is passed to
     ``self.get_movie_detail``. A falsy result shuts the browser and server
     down, waits, obtains a new driver and retries the same entry; there is
     no retry limit. Afterwards ``self.save_as_json()`` is called, the
     browser and server are stopped, the elapsed time is printed and
     ``self.send_file(self.project_path)`` is called.
     """
     urls = self.load_file()
     started_at = time.time()
     rule = '=' * 50
     for position, url in enumerate(urls, start=1):
         # Retry the same entry on a fresh browser until it reports success.
         while not self.get_movie_detail(url):
             print("quit chrome", rule)
             self.driver.quit()
             print("driver.quit", rule)
             self.server.stop()
             print("server.stop", rule)
             time.sleep(10)
             print("reopen chrome ", rule)
             self.driver, self.server, self.proxy = ChromeDriver().get_driver()
         # Progress line: running number and the entry just completed.
         print(position, url)

     self.save_as_json()
     self.driver.quit()
     self.server.stop()
     # The label below reads "program run time:".
     elapsed = time.time() - started_at
     print('程序运行时间:', elapsed, '=' * 40)
     self.send_file(self.project_path)