def process_request(self, request, spider):
    """Render requests flagged with ``meta['Selenium']`` in the spider's browser.

    Called for each request that goes through the downloader middleware.
    Must either:
      - return None: continue processing this request
      - or return a Response object
      - or return a Request object
      - or raise IgnoreRequest: process_exception() methods of installed
        downloader middleware will be called

    The shared ``spider.browser`` is configured with this middleware's
    ``timeout``, ``width`` and ``height`` on every call, flagged or not.

    :param request: request being processed; ``request.meta['Selenium']``
        must be truthy for the browser to be used.
    :param spider: spider owning the request; must expose a ``browser``
        WebDriver attribute.
    :returns: an ``HtmlResponse`` built from the browser's page source for
        flagged requests, otherwise ``None``.
    """
    browser = spider.browser
    browser.set_page_load_timeout(self.timeout)
    browser.set_window_size(self.width, self.height)

    if not request.meta.get('Selenium'):
        return None

    try:
        browser.get(request.url)
        # WebDriverWait has to be instantiated with a driver and a timeout;
        # calling ``until`` on the class itself always fails. The page-load
        # timeout is reused as the upper bound for the explicit wait.
        wait.WebDriverWait(browser, self.timeout).until(
            ec.presence_of_element_located((By.CLASS_NAME, "framework")))
    except Exception as e:
        # Best effort: report the problem and still hand back whatever the
        # browser managed to load.
        print("Exception is %s" % e)
    else:
        # Extra settle time after the awaited element has appeared.
        time.sleep(3)

    # ``until`` returns the located element, not the HTML, so the response
    # body is always taken from the browser's page source.
    body = browser.page_source.encode('utf-8')
    return http.HtmlResponse(request.url, encoding='utf-8', body=body,
                             request=request)
def process_request(self, request, spider):
    """Fetch ``request.url`` in a fresh headless Chrome and return its HTML.

    Called for each request that goes through the downloader middleware.
    Must either:
      - return None: continue processing this request
      - or return a Response object
      - or return a Request object
      - or raise IgnoreRequest: process_exception() methods of installed
        downloader middleware will be called

    After the page loads, the page's ``onDownloadApk(0)`` JavaScript
    function is invoked, and the response is built from the resulting page
    source and the browser's current URL.

    :param request: request whose URL is opened in the browser.
    :param spider: spider owning the request; its ``logger`` is used to
        report browser failures.
    :returns: an ``HtmlResponse`` for the browser's final URL, or ``None``
        when loading the page or running the script fails.
    """
    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    # NOTE(review): presumably the value 2 blocks image loading -- confirm
    # against the Chrome content-settings documentation.
    prefs = {"profile.managed_default_content_settings.images": 2}
    op.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(chrome_options=op)

    try:
        try:
            browser.get(request.url)
            browser.execute_script('onDownloadApk(0)')
        except Exception as exc:
            # Deliberate best effort: give up on the browser and return
            # None, but do not hide the reason.
            spider.logger.warning("Browser fetch of %s failed: %s",
                                  request.url, exc)
            return None
        content = browser.page_source.encode('utf-8')
        url = browser.current_url
    finally:
        # quit() ends the whole driver session on every exit path; close()
        # only closed the window and was skipped entirely on failure.
        browser.quit()

    return http.HtmlResponse(url=url, encoding='utf-8', body=content,
                             request=request)
def process_request(cls, request, spider):
    """Render requests flagged with ``meta['PhantomJS']`` through PhantomJS.

    :param request: request being processed; handled only when
        ``request.meta['PhantomJS']`` is truthy.
    :param spider: spider owning the request (unused).
    :returns: an ``HtmlResponse`` holding the rendered page source, or
        ``None`` for requests that are not flagged.
    """
    if not request.meta.get('PhantomJS', False):
        return None

    phantomjs_path = "D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe"
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    }

    # Each header is passed to PhantomJS as its own capability entry.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    for key, value in headers.items():
        dcap['phantomjs.page.customHeaders.{}'.format(key)] = value

    driver = webdriver.PhantomJS(executable_path=phantomjs_path,
                                 desired_capabilities=dcap)
    try:
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
    finally:
        # Always end the PhantomJS session, even when the page fails to
        # load, so no browser process is left behind.
        driver.quit()

    return http.HtmlResponse(request.url, encoding='utf-8', body=content,
                             request=request)
def process_request(cls, request, spider):
    """Render requests carrying a ``'PhantomJS'`` meta key through PhantomJS.

    Only the presence of the key is checked, not its value. The ``in``
    test is used because it works on both Python 2.7 and Python 3.

    :param request: request being processed.
    :param spider: spider owning the request (unused).
    :returns: an ``HtmlResponse`` holding the rendered page source, or
        ``None`` when the meta key is absent.
    """
    if 'PhantomJS' not in request.meta:
        return None

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    )
    driver = webdriver.PhantomJS(
        executable_path="D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe",
        desired_capabilities=dcap)
    try:
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
    finally:
        # Always end the PhantomJS session, even when the page fails to
        # load, so no browser process is left behind.
        driver.quit()

    return http.HtmlResponse(request.url, encoding='utf-8', body=content,
                             request=request)
def process_request(self, request, spider):
    """Load every request in the shared ``driver`` and return its HTML.

    The page is opened, given one second before its source is read, and
    the UTF-8 encoded source is wrapped in an ``HtmlResponse`` for the
    request's URL.
    """
    driver.get(request.url)
    time.sleep(1)
    encoded_page = driver.page_source.encode('utf-8')
    return http.HtmlResponse(
        url=request.url,
        body=encoded_page,
        encoding='utf-8',
        request=request,
    )
def process_request(self, request, spider):
    """Render requests carrying a ``'PhantomJS'`` meta key in Chrome.

    Only the presence of the key is checked, not its value.

    :param request: request being processed.
    :param spider: spider owning the request (unused).
    :returns: an ``HtmlResponse`` holding the rendered page source, or
        ``None`` when the meta key is absent.
    """
    # ``dict.has_key`` does not exist on Python 3; ``in`` works on 2 and 3.
    if 'PhantomJS' not in request.meta:
        return None

    driver = webdriver.Chrome()
    try:
        driver.get(request.url)
        time.sleep(3)  # pause 3 seconds to let the page's JS finish loading
        content = driver.page_source.encode('utf-8')
    finally:
        # Always end the browser session, even when the page fails to load.
        driver.quit()

    return http.HtmlResponse(request.url, encoding='utf-8', body=content,
                             request=request)
def process_request(self, request, spider):
    """Render flagged requests in Chrome after clicking the industry button.

    Requests carrying a ``'PhantomJS'`` meta key (presence only, the value
    is ignored) are opened in a fresh Chrome instance. The page's
    ``#buttonSelIndustry`` element is clicked through jQuery before the
    page source is captured.

    :param request: request being processed.
    :param spider: spider owning the request (unused).
    :returns: an ``HtmlResponse`` holding the rendered page source, or
        ``None`` when the meta key is absent.
    """
    # ``dict.has_key`` does not exist on Python 3; ``in`` works on 2 and 3.
    if 'PhantomJS' not in request.meta:
        return None

    driver = webdriver.Chrome()
    try:
        driver.get(request.url)
        time.sleep(3)  # pause 3 seconds to let the page's JS finish loading
        # Click the "select industry" button.
        driver.execute_script("$('#buttonSelIndustry').click()")
        content = driver.page_source.encode('utf-8')
    finally:
        # Always end the browser session, even when loading or the script
        # fails.
        driver.quit()

    return http.HtmlResponse(request.url, encoding='utf-8', body=content,
                             request=request)
def process_request(cls, request, spider):
    """Render requests flagged with ``meta['PhantomJS']`` through PhantomJS.

    The page load is capped at 3 seconds. If loading raises (for example
    because the cap is hit), loading is stopped with ``window.stop()`` and
    whatever has been rendered so far is returned.

    :param request: request being processed; handled only when
        ``request.meta['PhantomJS']`` is truthy.
    :param spider: spider owning the request (unused).
    :returns: an ``HtmlResponse`` with status 200 holding the page source,
        or ``None`` for requests that are not flagged.
    """
    if not request.meta.get('PhantomJS', False):
        return None

    url = request.url
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/',
    }

    # Each header is passed to PhantomJS as its own capability entry.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    for key, value in headers.items():
        dcap['phantomjs.page.customHeaders.{}'.format(key)] = value
    # NOTE(review): this user agent differs from the 'User-Agent' custom
    # header above -- confirm which one is meant to be sent.
    dcap["phantomjs.page.settings.userAgent"] = (
        'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    )

    driver = webdriver.PhantomJS(
        executable_path="D:\\work-path\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe",
        desired_capabilities=dcap)
    try:
        driver.set_page_load_timeout(3)
        try:
            driver.get(url)
        except Exception:
            # Best effort: stop loading and keep the partial page.
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            driver.execute_script('window.stop()')
        content = driver.page_source.encode('utf-8')
    finally:
        # Always end the PhantomJS session, whatever happened above.
        driver.quit()

    return http.HtmlResponse(url, status=200, encoding='utf-8', body=content,
                             request=request)
def process_request(cls, request, spider):
    """Render flagged requests through PhantomJS with a mobile user agent.

    :param request: request being processed; handled only when
        ``request.meta['PhantomJS']`` is truthy.
    :param spider: spider owning the request (unused).
    :returns: an ``HtmlResponse`` holding the rendered page source, or
        ``None`` for requests that are not flagged.
    """
    if not request.meta.get('PhantomJS', False):
        return None

    phantomjs_path = "D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe"
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    # The user-agent string identifies the browser as Chrome on an
    # Android Nexus 6.
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
    )

    driver = webdriver.PhantomJS(executable_path=phantomjs_path,
                                 desired_capabilities=dcap)
    try:
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
    finally:
        # Always end the PhantomJS session, even when the page fails to
        # load, so no browser process is left behind.
        driver.quit()

    return http.HtmlResponse(request.url, encoding='utf-8', body=content,
                             request=request)