def test_domain_allow(self):
    """allowed_domains must override blocked_domains: only www.acme.com
    may set cookies, and cookies set manually for other domains must not
    be sent back."""
    c = CookieJar(policy=DefaultCookiePolicy(
        blocked_domains=["acme.com"],
        allowed_domains=["www.acme.com"]))

    # acme.com is blocked (and not in the allow list) -> rejected
    req = Request("http://acme.com/")
    headers = {"Set-Cookie": "CUSTOMER=WILE_E_COYOTE; path=/"}
    res = Response("http://acme.com/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEqual(len(c), 0)

    # www.acme.com is explicitly allowed -> accepted
    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEqual(len(c), 1)

    # www.coyote.com is not allowed -> jar size unchanged
    req = Request("http://www.coyote.com/")
    res = Response("http://www.coyote.com/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEqual(len(c), 1)

    # set a cookie with non-allowed domain...
    req = Request("http://www.coyote.com/")
    res = Response("http://www.coyote.com/", headers=headers)
    cookies = c.make_cookies(res, req)
    c.set_cookie(cookies[0])
    self.assertEqual(len(c), 2)
    # ... and check it doesn't get returned
    c.add_cookie_header(req)
    self.assertNotIn('Cookie', req.headers)
def test_empty_path(self):
    """An empty Path attribute must be treated as the root path "/"."""
    # Broken web-server ORION/1.3.38 returns to the client response like
    #
    #       Set-Cookie: JSESSIONID=ABCDERANDOM123; Path=
    #
    # ie. with Path set to nothing.
    # In this case, extract_cookies() must set cookie to / (root)
    c = CookieJar(DefaultCookiePolicy(rfc2965=True))
    headers = Headers({'Set-Cookie': 'JSESSIONID=ABCDERANDOM123; Path='})

    req = Request("http://www.ants.com/")
    res = Response("http://www.ants.com/", headers=headers)
    c.extract_cookies(res, req)

    # cookie must be returned for any path under "/"
    req = Request("http://www.ants.com/")
    c.add_cookie_header(req)
    self.assertEqual(req.headers.get("Cookie"), "JSESSIONID=ABCDERANDOM123")
    self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

    # missing path in the request URI
    req = Request("http://www.ants.com:8080")
    c.add_cookie_header(req)
    self.assertEqual(req.headers.get("Cookie"), "JSESSIONID=ABCDERANDOM123")
    self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')
def test_missing_final_slash(self):
    """Missing slash from request URL's abs_path should be assumed present."""
    url = "http://www.acme.com"
    c = CookieJar(DefaultCookiePolicy(rfc2965=True))
    interact_2965(c, url, "foo=bar; Version=1")
    req = Request(url)
    self.assertEqual(len(c), 1)
    c.add_cookie_header(req)
    self.assertIn('Cookie', req.headers)
def create_request(self, url, response=None, **kwargs):
    """Build a Request for *url* whose Cookie header is populated from the
    jar stored in *response*'s meta (creating a fresh jar when no response
    is given).  Cookie merging by the middleware is disabled."""
    if response is None:
        jar = CookieJar()
    else:
        # Reuse (or lazily create) the jar travelling with the response,
        # updating it with the cookies that response just set.
        jar = response.meta.setdefault('cookie_jar', CookieJar())
        jar.extract_cookies(response, response.request)

    kwargs.update(meta={'dont_merge_cookies': True, 'cookie_jar': jar})
    new_request = Request(url, **kwargs)
    jar.add_cookie_header(new_request)
    return new_request
def test_add_cookie_header_clear_expired_cookies_based_on_check_frequency(
        self):
    """With check_expired_frequency=2, expired-cookie purging happens on
    the second add_cookie_header call, not the first."""
    cookiejar = CookieJar(check_expired_frequency=2)
    cookiejar.jar.clear_expired_cookies = Mock()
    req = Request("http://www.example.com/page.html",
                  headers={"Content-Type": "text/html"})

    cookiejar.add_cookie_header(req)
    cookiejar.jar.clear_expired_cookies.assert_not_called()

    cookiejar.add_cookie_header(req)
    cookiejar.jar.clear_expired_cookies.assert_called_once()
def create_request(self, url, response=None, **kwargs):
    # This function could be replaced by using CookiesMiddleware instead.
    # Pick the cookie jar: the one riding on the response's meta when a
    # response is supplied (refreshed with its Set-Cookie headers), else
    # a brand-new empty jar.
    jar = CookieJar() if response is None else response.meta.setdefault(
        'cookie_jar', CookieJar())
    if response is not None:
        jar.extract_cookies(response, response.request)

    kwargs.update(meta={
        'dont_merge_cookies': True,
        'cookie_jar': jar,
    })
    outgoing = Request(url, **kwargs)
    jar.add_cookie_header(outgoing)
    return outgoing
def test_netscape_example_2(self):
    """Second example transaction sequence from the Netscape cookie spec."""
    # Assume all mappings from above have been cleared.
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo
    #
    # When client requests a URL in path "/ammo" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # NOTE: There are two name/value pairs named "PART_NUMBER" due to
    # the inheritance of the "/" mapping in addition to the "/ammo" mapping.
    c = CookieJar()
    headers = Headers({'Set-Cookie': 'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/'})
    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)
    self.assertEqual(req.headers.get("Cookie"),
                     "PART_NUMBER=ROCKET_LAUNCHER_0001")

    headers.appendlist("Set-Cookie",
                       "PART_NUMBER=RIDING_ROCKET_0023; path=/ammo")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/ammo")
    c.add_cookie_header(req)
    # The more specific path ("/ammo") must be sent first.
    self.assertTrue(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*"
                              "PART_NUMBER=ROCKET_LAUNCHER_0001",
                              req.headers.get("Cookie")))
def testMalformedCookieHeaderParsing(self):
    """Malformed Set-Cookie values (trailing ';', missing '=') must still
    be parsed, stored and echoed back in the Cookie header."""
    headers = Headers({'Set-Cookie': [
        'CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-2100 23:12:40 GMT',
        'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/',
        'SHIPPING=FEDEX; path=/foo',
        'COUNTRY=UY; path=/foo',
        'GOOD_CUSTOMER;',
        'NO_A_BOT;']})

    res = Response('http://www.perlmeister.com/foo', headers=headers)
    req = Request('http://www.perlmeister.com/foo')
    c = CookieJar()
    c.extract_cookies(res, req)
    c.add_cookie_header(req)
    # Most-specific paths ("/foo") come first, value-less cookies last.
    self.assertEqual(req.headers.get('Cookie'),
                     'COUNTRY=UY; SHIPPING=FEDEX; CUSTOMER=WILE_E_COYOTE; '
                     'PART_NUMBER=ROCKET_LAUNCHER_0001; NO_A_BOT; GOOD_CUSTOMER')
def test_domain_block(self):
    """blocked_domains must stop both storing and returning cookies, and
    must distinguish ".acme.com" (blocks subdomains) from "acme.com"."""
    pol = DefaultCookiePolicy(
        rfc2965=True, blocked_domains=[".acme.com"])
    c = CookieJar(policy=pol)
    headers = {'Set-Cookie': 'CUSTOMER=WILE_E_COYOTE; path=/'}

    # ".acme.com" blocks www.acme.com -> rejected
    req = Request("http://www.acme.com/")
    res = Response('http://www.acme.com/', headers=headers)
    c.extract_cookies(res, req)
    self.assertEqual(len(c), 0)

    # "acme.com" (no leading dot) does NOT block www.acme.com -> accepted
    # (set_blocked_domains returns None, so its result is not captured)
    pol.set_blocked_domains(["acme.com"])
    c.extract_cookies(res, req)
    self.assertEqual(len(c), 1)

    c.clear()
    req = Request("http://www.roadrunner.net/")
    res = Response("http://www.roadrunner.net/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEqual(len(c), 1)
    req = Request("http://www.roadrunner.net/")
    c.add_cookie_header(req)
    self.assertIn('Cookie', req.headers)
    self.assertIn('Cookie2', req.headers)

    c.clear()
    pol.set_blocked_domains([".acme.com"])
    c.extract_cookies(res, req)
    self.assertEqual(len(c), 1)

    # set a cookie with blocked domain...
    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)
    cookies = c.make_cookies(res, req)
    c.set_cookie(cookies[0])
    self.assertEqual(len(c), 2)
    # ... and check it doesn't get returned
    c.add_cookie_header(req)
    self.assertNotIn('Cookie', req.headers)
def test_netscape_misc(self):
    """Some additional Netscape cookies tests."""
    c = CookieJar()
    headers = Headers()
    req = Request("http://foo.bar.acme.com/foo")

    # Netscape allows a host part that contains dots
    headers.appendlist("Set-Cookie",
                       "Customer=WILE_E_COYOTE; domain=.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    # and that the domain is the same as the host without adding a leading
    # dot to the domain.  Should not quote even if strange chars are used
    # in the cookie value.
    headers.appendlist("Set-Cookie",
                       "PART_NUMBER=3,4; domain=foo.bar.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://foo.bar.acme.com/foo")
    c.add_cookie_header(req)
    cookie_header = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=3,4", cookie_header)
    self.assertIn("Customer=WILE_E_COYOTE", cookie_header)
class zhihuCrawler(CrawlSpider):
    """Zhihu spider: logs in with a PhantomJS browser (handling the
    captcha manually), persists the login cookies to ``login_cookie.json``,
    then crawls user profile pages and their followed topics.

    NOTE(review): the original SOURCE was whitespace-collapsed; statement
    grouping inside loops has been reconstructed from context.
    """
    allowed_domains = ["www.zhihu.com"]
    host_url = "https://www.zhihu.com"
    start_urls = [
        "https://www.zhihu.com"
    ]
    # Static headers sent with every request; the hard-coded Cookie line
    # below was deliberately left commented out.
    headers = {
        'Connection': 'Keep-Alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'text / html, application / xhtml + xml, image / jxr, * / *',
        'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/'
        # ' Cookie': '_za=5852b28b-399a-4bd8-8282-59070203151f; _xsrf=7d7cdde47226ee4e485a9cc9925f2715; __utmc=51854390; q_c1=73b48dcd9e84486f81814ea556dac319|1468220250000|1468220250000; l_cap_id=NjFiN2M2YzBmYmMwNDRmODk3ZGU3NTQ0ODllMzYyYzY=|1468827275|ccd88305461b2a3f2d9c38ec5c651e1bfcba81de; cap_id=ZGQ1MjFjMzM5MGI2NDY5ZmFjMGQ5NzMxODI2M2EzNWM=|1468827275|e44ef0b232dd85e4a62077a6a67e83ccbe963692; _zap=82d8c931-4ad6-464b-8e3f-2e430cce84e0; d_c0=AIBAeHJDNgqPTo5KKrizojLF6zLSb8c38qo=|1468220251; login=ZGNlMjUwYzNjNmMxNDI0N2I1YjQyMjVlMDM3YjMwN2Y=|1468827275|1c4c6a2dd0dec9d3948653906728e6ceb22154b2; __utma=51854390.1408714905.1468575990.1468819740.1468824510.5; __utmz=51854390.1468824510.5.4.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/topic/19552832/top-answers; __utmv=51854390.000--|2=registration_date=20130613=1^3=entry_date=20160711=1; __utmb=51854390.8.10.1468824510; n_c=1'
    }
    name = 'zhihu'
    _xsrf = ''                      # anti-CSRF token scraped from the sign-in form
    cookiejar = CookieJar()         # session cookies (replaced after login)
    driver = None                   # PhantomJS webdriver instance
    login_cookies = None            # dict of http.cookiejar Cookie objects
    login_cookies_dict = None       # plain-dict mirror of login_cookies (JSON-safe)
    # handle_httpstatus_list = [302]
    # rules = (
    #     Rule(SgmlLinkExtractor(allow=(r'/question/\d+',)), follow=True),
    #     Rule(SgmlLinkExtractor(allow=(r'/people/(\w+-?)+$',)), callback='parse_page'),
    # )

    def __init__(self):
        # NOTE(review): super(CrawlSpider, self) skips CrawlSpider's own
        # __init__ in the MRO (calls Spider's) — likely intentional to
        # avoid compiling the commented-out rules, but worth confirming.
        super(CrawlSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # NOTE(review): this Request is constructed but never scheduled or
        # returned, so the logout callback can never fire.
        Request('http://www.zhihu.com/logout', method='GET', callback=self.logout)
        pass

    def logout(self):
        print('退出成功')
        pass

    def start_requests(self):
        """Start a PhantomJS session; reuse saved cookies when still valid,
        otherwise drive the sign-in form (downloading the captcha first
        when one is displayed)."""
        _driver = webdriver.PhantomJS(service_log_path='./phantomjs.log')
        _driver.set_window_size(GetSystemMetrics(0), GetSystemMetrics(1))
        self.driver = _driver
        (cookies, expires) = self.getcookies()
        if expires < time.time():
            expires = False
            print('cookie过期')
        if cookies and expires:
            # Saved cookies are still valid: rebuild the jar and skip login.
            self.cookiejar = CookieJar()
            for key in cookies:
                self.cookiejar.set_cookie(cookies[key])
            for url in self.start_urls:
                requset = Request(url, headers=self.headers,
                                  meta={'dont_merge_cookies': True, 'cookiejar': 1},
                                  callback=self.parse_page)
                self.cookiejar.add_cookie_header(requset)
                return [requset]
        _driver.get("https://www.zhihu.com/#signin")
        # wait = WebDriverWait(driver, 12)  # wait
        time.sleep(8)  # wait for the page to finish loading
        _xsrf = _driver.find_element_by_xpath('//input[@name="_xsrf"]')
        _xsrf = _xsrf.get_attribute('value')
        print('_xsrf------->', _xsrf)
        input_wrapper = _driver.find_element_by_xpath('//div[@data-za-module="SignInForm"]')
        # iCaptcha = True
        # wait for the captcha widget to load (two possible markups)
        try:
            # input_captcha = wait.until(
            #     EC.presence_of_element_located((By.XPATH, './/div[@class="input-wrapper captcha-module"]')))
            input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="input-wrapper captcha-module"]')
        except:
            try:
                # input_captcha = wait.until(
                #     EC.presence_of_element_located((By.XPATH, './/div[@class="iCaptcha input-wrapper"]')))
                # iCaptcha = False
                input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="iCaptcha input-wrapper"]')
            except:
                input_captcha = None
        if input_captcha:
            hasShow = input_captcha.is_displayed()
        else:
            hasShow = False
        print(input_captcha, '-----captcha_url----->', hasShow)
        if hasShow:
            # A captcha is displayed: download its image first.
            # TODO: rework this — fetching the captcha URL directly makes
            # the server refresh the captcha, invalidating the one shown.
            captcha_url = input_wrapper.find_element_by_xpath('.//img').get_attribute('src')
            print('captcha_url---->', captcha_url)
            _driver.close()
            return [Request(captcha_url, headers=self.headers,
                            callback=self.download_captcha, meta={'_xsrf': _xsrf})]
        else:
            _driver.close()
            return [self.post_login(_xsrf)]

    def download_captcha(self, response):
        # Download the captcha image to disk.
        with open('captcha.gif', 'wb') as fp:
            fp.write(response.body)
        # Open the captcha image with the system default viewer
        # (Windows 'start' command).
        os.system('start captcha.gif')
        # Prompt the operator to type the captcha.
        print('Please enter captcha: ')
        captcha = input()
        return self.post_login(response.meta['_xsrf'], captcha)

    def post_login(self, _xsrf, captcha=None):
        """Build the login FormRequest (captcha included when provided)."""
        formdata = {'_xsrf': _xsrf,
                    'password': projectsetting.PASS_WORD,  # your password
                    'captcha_type': 'cn',
                    'remember_me': 'true',
                    'email': projectsetting.USER_NAME}  # your account
        if captcha != None:
            formdata['captcha'] = captcha
        return FormRequest("https://www.zhihu.com/login/email", method='POST',
                           headers=self.headers,
                           callback=self.login_result,
                           meta={'dont_merge_cookies': True},
                           formdata=formdata)  # your account
        pass

    def login_result(self, response):
        """Handle the login response: on success ('r' == 0), persist the
        session cookies and kick off the crawl."""
        body = json.loads(response.body.decode('utf-8'))
        print('content---->', body)
        if body.get('r') != 0:
            return
        self.cookiejar = response.meta.setdefault('cookiejar', CookieJar())
        self.cookiejar.extract_cookies(response, response.request)
        self.savecookies(self.cookiejar._cookies)
        for url in self.start_urls:
            requset = Request(url, headers=self.headers,
                              meta={'dont_merge_cookies': True, 'cookiejar': 1},
                              callback=self.parse_page)
            yield requset
        pass

    def savecookies(self, cookies):
        """Flatten the nested domain/path cookie mapping and dump the
        Cookie objects (as plain dicts) to login_cookie.json."""
        copyCookie = dict()
        with open('login_cookie.json', 'w') as cookiesfile:
            def convterall(cookies):
                # Recurse through the {domain: {path: {name: Cookie}}} tree.
                for key in cookies.keys():
                    value = cookies.get(key)
                    if isinstance(value, Cookie):
                        copyCookie[key] = self.class2str(value)
                    elif isinstance(value, dict):
                        convterall(value)
            convterall(cookies)
            self.login_cookies_dict = copyCookie
            cookiesfile.write(json.dumps(copyCookie))
        pass

    def class2str(self, dictdata):
        # Shallow-copy an object's attribute dict (JSON-serializable view).
        dic = {}
        dic.update(dictdata.__dict__)
        return dic
        pass

    def dict2cookie(self, cookie_dict):
        """Rebuild Cookie objects from their saved attribute dicts.

        NOTE(review): builds a 'Cookie(...)' source string and eval()s it;
        eval on file contents is unsafe if login_cookie.json can be
        tampered with.
        """
        result = {}
        for item in cookie_dict.items():
            param = ''
            for key in item[1]:
                value = item[1][key]
                if type(value) == str:
                    value = "'" + value + "'"
                if key[0] == '_':
                    # saved keys like '_rest' map to the 'rest' kwarg
                    key = key[1:]
                param += '{0}={1},'.format(key, value)
            param = param[0:-1]
            evalstr = 'Cookie({0})'.format(param)
            result[item[0]] = eval(evalstr)
        return result

    def getcookies(self):
        """Return (cookies, expires): cached login cookies if already in
        memory, else loaded from login_cookie.json, else (None, 0)."""
        expires = 0
        if self.login_cookies:
            for key in self.login_cookies:
                expires = self.login_cookies[key].expires
                break
            return (self.login_cookies, expires)
        if not os.path.exists('login_cookie.json'):
            return (None, 0)
        with open('login_cookie.json', encoding='utf-8') as cookiesfile:
            cookiesstr = cookiesfile.read()
            if cookiesstr == '' or cookiesstr == None:
                return (None, 0)
            cookies = json.loads(cookiesstr)
            self.login_cookies_dict = cookies
            self.login_cookies = self.dict2cookie(cookies)
            expires = 0
            if self.login_cookies:
                for key in self.login_cookies:
                    expires = self.login_cookies[key].expires
                    if expires != None:
                        break
            return (self.login_cookies, expires)
        pass

    def parse_page(self, response):
        """Entry page: truncate users.json, then follow the logged-in
        user's own profile link."""
        with open('users.json', 'w') as user:
            user.write('')
        sel = Selector(response)
        href = sel.xpath('//ul[@id="top-nav-profile-dropdown"]/li[1]/a/@href').extract()[0]
        print('href----->', href)
        cookiejar = response.meta['cookiejar']
        request = Request(self.host_url + href, headers=self.headers,
                          meta={'cookiejar': cookiejar},
                          callback=self.people_page)
        return request
        pass

    def people_page(self, response):
        """Scrape one profile, then walk its followees/followers pages with
        the webdriver (infinite-scrolling to load everyone)."""
        yield self.parse_item(response)
        sel = Selector(response)
        # followees and followers
        following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')
        # TODO: recursively collect every valid user's follow data
        followings = following.xpath('.//a/@href').extract()
        for follow_link in followings:
            # yield self.cookiejar_addcookies(response, url=follow_link, callback=self.followees_page)  # this call gets redirected; not solved yet
            self.webdriver_addcookies(follow_link)
            browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
            while True:
                # do the scrolling
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)  # wait for the new data to load
                scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
                if browerHeight == scrollHeight:
                    break
                browerHeight = scrollHeight
            peoplelinks = self.driver.find_elements_by_xpath('//a[@class="zm-item-link-avatar"]')
            for link in peoplelinks:
                href = link.get_attribute('href')
                # some user links cannot be found here — to be investigated
                yield self.cookiejar_addcookies(response, url=href, callback=self.people_page)
            pass
        # followees = followings[0]  # followees link
        # followers = followings[1]  # followers link
        pass

    def webdriver_addcookies(self, url):
        """Inject the saved login cookies into the webdriver, then load url
        (prefixing host_url for relative links)."""
        for key in self.login_cookies_dict:
            cookie = self.login_cookies_dict[key]
            self.driver.add_cookie({k: cookie[k] for k in ['name', 'value', 'domain', 'path']})
        if url.find('http://') > -1 or url.find('https://') > -1:
            pass
        else:
            url = self.host_url + url
        self.driver.get(url)
        pass

    def cookiejar_addcookies(self, response, url, callback):
        """Build a non-redirecting Request for url that carries the
        response's cookiejar reference in its meta."""
        cookiejar = response.meta['cookiejar']
        if url.find('http://') > -1 or url.find('https://') > -1:
            pass
        else:
            url = self.host_url + url
        request = Request(url, headers=self.headers, dont_filter=True,
                          meta={'cookiejar': cookiejar, 'dont_redirect': True,
                                'handle_httpstatus_list': [302]},
                          callback=callback)
        # cookiejar.add_cookie_header(request)
        return request
        pass

    def followees_page(self, response):
        """Follow a 302 Location manually (redirects are disabled), then
        queue every profile linked from the followees list."""
        if response.status in (302,) and 'Location' in response.headers:
            url = unquote(response.headers['Location'].decode('utf-8'))
            self.logger.debug(
                "(followees_page) Location header: %r" % response.urljoin(url))
            yield self.cookiejar_addcookies(response, response.urljoin(url), self.followees_page)
        sel = Selector(response)
        peoplelinks = sel.xpath('//a[@class="zm-item-link-avatar"]/@href').extract()
        for link in peoplelinks:
            yield self.cookiejar_addcookies(response, url=link, callback=self.people_page)
        pass

    def parse_item(self, response):
        """Build a ZhihuItem from a profile page; skip accounts with zero
        followees+followers, and scrape the followed-topics page with the
        webdriver (scrolled to the bottom to load every topic)."""
        sel = Selector(response)
        following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')
        followees_followers = following.xpath('.//strong/text()').extract()
        count = 0
        for follow in followees_followers:
            count += int(follow)
        if count == 0:
            # zero followees and followers -> treat as a zombie account
            print('这是一个僵尸号:', response.url.replace(self.host_url + '/people/', ''))
            return
        topics_link = sel.xpath('//a[@class="zg-link-litblue"]/@href').extract()
        for topics in topics_link:
            if topics.find('topics') > -1:
                # keep the last href containing 'topics'
                topics_link = topics
        print('topics->>>>>>>>>>>', topics_link)
        # open the followed-topics page
        self.webdriver_addcookies(topics_link)
        browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
        while True:
            # do the scrolling
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # wait for the new data to load
            scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
            if browerHeight == scrollHeight:
                break
            browerHeight = scrollHeight
        topic_list = self.driver.find_element_by_id('zh-profile-topic-list')
        item = ZhihuItem()
        item['name'] = self.driver.find_element_by_xpath('//a[@class="name"]').text
        try:
            item['business'] = self.driver.find_element_by_xpath('//span[@class="business item"]').get_attribute(
                'title')
        except:
            item['business'] = ''
        try:
            item['location'] = self.driver.find_element_by_xpath('//span[@class="location item"]').text
        except:
            item['location'] = ''
        topics = []
        topic_divs = topic_list.find_elements_by_xpath('./div')
        for topic in topic_divs:
            section = topic.find_element_by_xpath('./div[@class="zm-profile-section-main"]')
            links = section.find_elements_by_tag_name('a')
            topicdata = links[1]
            topic_id = os.path.basename(topicdata.get_attribute('href'))
            topic_name = topicdata.find_element_by_tag_name('strong').text
            topic_answers = int(links.pop().text.replace(' 个回答', ''))
            topics.append({'topic_id': topic_id, 'topic_name': topic_name, 'topic_answers': topic_answers})
        item['topics'] = topics
        # Temporary file dump for manual inspection — with item_pipeline the
        # data is only visible after the run finishes, which is slow for
        # large volumes.
        # with codecs.open('users.json', 'a', encoding='utf-8') as user:
        #     line = json.dumps(dict(item)) + ','
        #     user.write(line.encode('latin-1').decode('unicode_escape'))
        return item
def test_netscape_example_1(self):
    """First example from the original Netscape cookie specification at
    http://www.netscape.com/newsref/std/cookie_spec.html"""
    #-------------------------------------------------------------------
    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE
    #
    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: SHIPPING=FEDEX; path=/fo
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # When client requests a URL in path "/foo" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX
    #
    # The last Cookie is buggy, because both specifications say that the
    # most specific cookie must be sent first.  SHIPPING=FEDEX is the
    # most specific and should thus be first.
    year_plus_one = time.localtime()[0] + 1

    c = CookieJar(DefaultCookiePolicy(rfc2965=True))

    #req = Request("http://1.1.1.1/",
    #              headers={"Host": "www.acme.com:80"})
    req = Request("http://www.acme.com:80/",
                  headers={"Host": "www.acme.com:80"})

    headers = Headers()
    headers['Set-Cookie'] = 'CUSTOMER=WILE_E_COYOTE; path=/ ; expires=Wednesday, 09-Nov-%d 23:12:40 GMT' % year_plus_one
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    self.assertEqual(req.headers.get("Cookie"), "CUSTOMER=WILE_E_COYOTE")
    self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

    headers.appendlist("Set-Cookie", "PART_NUMBER=ROCKET_LAUNCHER_0001; path=/")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/foo/bar")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=ROCKET_LAUNCHER_0001", h)
    self.assertIn("CUSTOMER=WILE_E_COYOTE", h)

    headers.appendlist('Set-Cookie', 'SHIPPING=FEDEX; path=/foo')
    res = Response("http://www.acme.com", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    # SHIPPING cookie is scoped to /foo, so it must not be sent for "/"
    h = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=ROCKET_LAUNCHER_0001", h)
    self.assertIn("CUSTOMER=WILE_E_COYOTE", h)
    self.assertNotIn("SHIPPING=FEDEX", h)

    req = Request("http://www.acme.com/foo/")
    c.add_cookie_header(req)

    # for /foo/ the most specific cookie (SHIPPING) must come first
    h = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=ROCKET_LAUNCHER_0001", h)
    self.assertIn("CUSTOMER=WILE_E_COYOTE", h)
    self.assertTrue(h.startswith("SHIPPING=FEDEX;"))
def process_request(self, request, spider):
    """Attach a rotating proxy, user-agent and cookie set to *request*.

    When the stored cookies are stale/over-used, a real Chrome session is
    driven to https://www.amap.com to harvest fresh cookies.

    Raises:
        CloseSpider: when no proxies are left.
    """
    random_proxy = self.get_random_proxy()
    if random_proxy:
        proxy_id, proxy = random_proxy
    else:
        # self.crawler.engine.close_spider(spider, 'closespider_proxyusedup')
        raise CloseSpider('proxy used up')
    random_ua = self.get_random_ua()
    if random_ua:
        ua_id, ua = random_ua
    # NOTE(review): if get_random_ua() returns a falsy value, ua/ua_id are
    # unbound below — presumably it always returns a pair; confirm.
    cookies_id, cookies, counter = self.get_cookies(proxy_id, ua_id)
    proxies = {'https': 'https://%s' % proxy, 'http': 'http://%s' % proxy}
    # When the cookies have been used too many times, refresh them.
    if counter % self.max_cookies_counter == 0:
        cookies = None
        request.cookies = {}
    # If we still have cookies, attach them to the request.
    if cookies:
        # Convert the string-typed cookies into a dict for the request.
        request.cookies = json.loads(cookies)
    else:
        logger.debug('Chrome is starting')
        chrome_options = Options()
        # chrome_options.add_argument("--headless")
        # set the user-agent
        chrome_options.add_argument('--user-agent=%s' % ua)
        browser = webdriver.Chrome(chrome_options=chrome_options)
        try:
            browser.get('https://www.amap.com')
            # renamed from 'input' so the builtin is not shadowed
            search_box = browser.find_element_by_id('searchipt')
            search_box.send_keys('美食')
            search_box.send_keys(Keys.ENTER)
            wait = WebDriverWait(browser, 10)
            wait.until(
                EC.presence_of_element_located((By.ID, 'maptoolbox')))
        except Exception:
            # best-effort: keep going without fresh cookies
            logger.warning('Error in selenium')
        else:
            cookies = browser.get_cookies()
            request.cookies = cookies
        finally:
            time.sleep(5)
            browser.close()
    if cookies:
        jar = CookieJar()
        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)
        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
    # set the user-agent header
    request.headers.setdefault(b'User-Agent', ua)
    request.meta['proxies'] = proxies
    request.meta['proxy_id'] = proxy_id
    request.meta['ua'] = ua
    request.meta['cookies_counter'] = counter
    request.meta['cookies_id'] = cookies_id