def get_cookies_dict_from_response(response):
    """Build a plain {name: value} dict from the cookies set by a response."""
    jar = CookieJar()
    jar.extract_cookies(response, response.request)
    # make_cookies() returns Cookie objects parsed from the response headers
    cookie_objs = jar.make_cookies(response, response.request)
    cookies = {cookie.name: cookie.value for cookie in cookie_objs}
    return cookies
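# A minimal usage sketch for the helper above, assuming it is importable in the
# spider module; the spider name and URLs are hypothetical placeholders. It
# simply captures the cookies from the first response and passes them
# explicitly to a follow-up request.
import scrapy


class CookieEchoSpider(scrapy.Spider):
    name = "cookie_echo"  # hypothetical name
    start_urls = ["https://example.com/"]  # placeholder URL

    def parse(self, response):
        cookies = get_cookies_dict_from_response(response)
        yield scrapy.Request(
            "https://example.com/account",  # placeholder URL
            cookies=cookies,
            callback=self.parse_account,
        )

    def parse_account(self, response):
        self.logger.info("fetched %s with captured cookies", response.url)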
def parse(self, response):
    """
    Log in to Chouti.
    :param response:
    :return:
    """
    # Create the cookie jar
    cookie_obj = CookieJar()
    # Extract the cookies from the response
    cookie_obj.extract_cookies(response, response.request)
    # Store the cookies on the spider instance
    self.cookie = cookie_obj._cookies
    # Issue the login request
    yield Request(
        url="https://dig.chouti.com/login",
        method="POST",
        headers={
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        },
        # The request body must be a string here, not a dict
        body="phone=886918207171&password=jamie851230&oneMonth=1",
        cookies=cookie_obj._cookies,
        callback=self.check_login,
    )
def parse1(self, response):
    # response.text holds the full front-page HTML
    from scrapy.http.cookies import CookieJar
    cookie_jar = CookieJar()  # object that wraps the cookies
    cookie_jar.extract_cookies(response, response.request)  # pull the cookies out of the response
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    post_dict = {
        'phone': '8615131255089',
        'password': '******',
        'oneMonth': 1,
    }
    import urllib.parse
    # Send a POST request to log in
    yield Request(url="http://dig.chouti.com/login",
                  method='POST',
                  cookies=self.cookie_dict,
                  body=urllib.parse.urlencode(post_dict),
                  headers={
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
                  },
                  callback=self.parse2)
def parse(self, response):
    login_url = 'https://passport.36kr.com/passport/sign_in'
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for m, n in cookie_jar._cookies.items():
        for m1, n1 in n.items():
            for m2, n2 in n1.items():
                if m2 in ['aliyungf_tc', 'krnewsfrontss', 'device-uid', 'M-XSRF-TOKEN']:
                    self.cookie_jar[m2] = n2.value
    """
    type:login
    bind:false
    needCaptcha:false
    username:18616561846
    password:abcd.1234
    ok_url:https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE
    ktm_reghost:null
    """
    yield scrapy.Request(
        url=login_url,
        method='POST',
        body='type=login&bind=false&needCaptcha=false&username=18616561846'
             '&password=abcd.1234'
             '&ok_url=https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE'
             '&ktm_reghost=null',
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        cookies=self.cookie_jar,
        callback=self.login)
def parse(self, response):
    # response.text holds the full front-page HTML
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    # Walk the jar and flatten the cookies into a dict
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    post_dict = {
        'phone': '8615915455813',
        'password': '******',
        'oneMonth': 1,
    }
    # Send a POST request to log in
    yield Request(
        url='https://dig.chouti.com/login',
        method='POST',
        cookies=self.cookie_dict,
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        },
        body=urllib.parse.urlencode(post_dict),
        callback=self.parse2  # called once the request succeeds
    )
def parse(self, response):
    cookie_obj = CookieJar()
    cookie_obj.extract_cookies(response, response.request)
    # print('cookie get ==================')
    # print(cookie_obj)
    # print(dir(cookie_obj))
    # print(cookie_obj._cookies)  # access the stored cookies
    # print('cookie end ===================')
    # print(response)
    page = response.meta['page']
    next_page = page + 1
    logging.info('on parse')
    logging.info(f'next page ========== {next_page}')
    articles = response.xpath('//article[@class="excerpt"]')
    for article in articles:
        item = AsyncSandboxItem()
        category = article.xpath('./header/a[1]/text()').extract_first()
        title = article.xpath('./header/h2/a[1]/text()').extract_first()
        article_url = article.xpath('./header/h2/a[1]/@href').extract_first()
        item['title'] = title
        item['category'] = category
        item['article_url'] = article_url
        yield Request(url=article_url,
                      callback=self.parse_item,
                      meta={'item': item})
    if next_page < 900:
        yield Request(
            url=self.BASE_URL.format(next_page),
            meta={'page': next_page},
            # dont_filter=True
        )
def __init__(self, start_url=None, history=True):
    super(Sdjnggzyjy, self).__init__()
    # jobs = start_url.split('|')
    jobs = self.start_urls
    self.cookie_jar = CookieJar()
    self.count = 0
    for job in jobs:
        self.headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # 'Accept-Encoding': 'gzip,deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'Content-Length': '90',
            'Content-Type': 'application/json',
            # 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/64.0',
            'Host': 'www.jnggzyjy.gov.cn',
            'Referer': job.split()[0],
            'Public-X-XSRF-TOKEN': '',
            # 'X-Requested-With': 'XMLHttpRequest',
        }
        self.post_params.append({
            "url": job.split()[0],
            "ba_type": job.split()[1]
        })
    # dispatcher.connect(self.initial, signals.engine_started)
    self.history = history
def login(self, response):
    from scrapy.http.cookies import CookieJar
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    post_dict = {
        "phone": "8618001999999",
        "password": "******",
        "oneMonth": 1,
    }
    import urllib.parse
    yield Request(
        url="http://dig.chouti.com/login",
        method='POST',
        cookies=self.cookie_dict,
        body=urllib.parse.urlencode(post_dict),
        headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
        callback=self.parse1
    )
def login(self, response):
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)  # grab the not-yet-authenticated cookies
    for k, v in cookie_jar._cookies.items():
        # print("====", v.items())
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
                # print("---", j.items())
    data = {
        "phone": "8613476152416",
        "password": "******",
        "oneMonth": 1,
    }
    # urlencode turns the dict into a query string: phone=8613476152416&password=******&oneMonth=1
    # print("--", urlencode(data))
    req = Request(url='https://dig.chouti.com/login',
                  method='POST',
                  headers={
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
                  },
                  body=urlencode(data),
                  cookies=self.cookie_dict,
                  callback=self.check_login)
    yield req
def parse1(self, response):
    """Handle the front page and start the login."""
    # response.text holds the full front-page HTML
    from scrapy.http.cookies import CookieJar
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)  # pull the cookies out of the response
    self.cookie_jar = cookie_jar  # extract_cookies() returns None, so keep the jar itself
    post_dict = {
        'phone': '8617748232617',
        'password': '******',
        'oneMonth': 1,
    }
    import urllib.parse
    # urlencode produces a body of the form: phone=86123&password=123&oneMonth=1
    data = urllib.parse.urlencode(post_dict)
    # Send the POST request to log in
    yield Request(url='http://dig.chouti.com/login',
                  method='POST',
                  # The cookies kwarg expects a dict, not a CookieJar, so flatten it
                  cookies={c.name: c.value for c in self.cookie_jar},
                  body=data,
                  headers={
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
                  },
                  callback=self.parse2)
def login(self, response):
    '''Send an AJAX-style POST request to log in.'''
    # Pull the cookie information out of the response
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    # Flatten the jar into a dict; without this, self.cookie_dict stays empty
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    login_req = Request(
        url='http://dig.chouti.com/login',
        method='POST',
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        },
        # Form-encoded body: key=value pairs joined by '&'
        body='phone=8618922795525&password=woaiwojia89&oneMonth=1',
        cookies=self.cookie_dict,
        callback=self.check_login,
    )
    print(self.cookie_dict)
    print('login request issued')
    yield login_req
def parse(self, response):
    """
    Handle the response from the first visit to Chouti.
    :param response:
    :return:
    """
    # Get the cookies from the response headers; they end up stored in the cookie_jar object
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    # Flatten the cookies from the jar into a dict
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    yield Request(
        url='https://dig.chouti.com/login',
        method='POST',
        body="phone=8613121758648&password=woshiniba&oneMonth=1",
        # body=urlencode({...}) would produce the same "phone=...&password=...&oneMonth=1" string
        cookies=self.cookie_dict,
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        },
        callback=self.check_login)
def get_cookies(self, res):
    '''Persist the cookies obtained after a successful login.'''
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(res, res.request)
    # The cookie jar behaves like a dict-like container, so write its entries to a file
    with open('cookies.txt', 'w') as f:
        for cookie in cookie_jar:
            f.write(str(cookie) + '\n')
def login_callback(self, response):
    cookiejar = CookieJar()
    cookiejar.extract_cookies(response, response.request)
    self.cookiejar = cookiejar
    request = scrapy.http.Request(
        url='http://www.baiinfo.com/Search/Index?wd=%E6%95%A3%E8%A3%85%E8%BF%9B%E5%8F%A3%E6%B2%A5%E9%9D%92%E5%88%B0%E5%B2%B8%E4%BB%B7',
        callback=self.parse_index)
    return [request]
def test_missing_final_slash(self):
    # Missing slash from request URL's abs_path should be assumed present.
    url = "http://www.acme.com"
    c = CookieJar(DefaultCookiePolicy(rfc2965=True))
    interact_2965(c, url, "foo=bar; Version=1")
    req = Request(url)
    self.assertEquals(len(c), 1)
    c.add_cookie_header(req)
    self.assert_('Cookie' in req.headers)
def get_cookies(response) -> dict:
    cookie_dict = {}
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                cookie_dict[m] = n.value
    return cookie_dict
def extract_to_file(session=context.http_session):
    scrapy_jar = CookieJar(policy=session.cookies.get_policy())
    for cookie in session.cookies:
        scrapy_jar.set_cookie(cookie)
    with open(cookie_name(), 'wb') as io_writer:
        # save it in a way scrapy_cookies can gather later on
        pickle.dump({None: scrapy_jar}, io_writer)
    os.chmod(cookie_name(), mode=0o600)
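# A hedged counterpart to extract_to_file() above: a sketch of reading the
# pickled jar back and copying its cookies into the session. It assumes
# context.http_session is a requests-style session (as the snippet above
# suggests) and reuses the cookie_name() helper; both are taken on faith here.
import pickle


def load_from_file(session=context.http_session):
    with open(cookie_name(), 'rb') as io_reader:
        jars = pickle.load(io_reader)  # {None: scrapy CookieJar}, as written above
    for jar in jars.values():
        # scrapy's CookieJar iterates plain http.cookiejar Cookie objects,
        # which a cookielib-compatible session jar can absorb directly
        for cookie in jar:
            session.cookies.set_cookie(cookie)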
def login_callback(self, response):
    cookiejar = CookieJar()
    cookiejar.extract_cookies(response, response.request)
    self.cookiejar = cookiejar
    logging.debug('enter login_callback')
    request = scrapy.http.Request(url='http://oil.chem99.com/news/28156799.html',
                                  callback=self.parse_content)
    return [request]
def get_cookie_by_cookie_jar_from_response(response=None):
    cookies = {}
    if response:
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for cookie in cookie_jar:
            cookies.update({
                cookie.name: cookie.value,
            })
    return cookies
def saveCookies(self, response):
    cookieJar = CookieJar()
    cookieJar.extract_cookies(response, response.request)
    for cookie in cookieJar:
        self.cookies[cookie.name] = cookie.value
    print('Log in completed!')
    return Request(self._discorveryUrl,
                   headers={'Referer': self._discorveryReferer},
                   cookies=self.cookies,
                   callback=self.parse)
def login_callback(self, response):
    cookiejar = CookieJar()
    cookiejar.extract_cookies(response, response.request)
    self.cookiejar = cookiejar
    for page in range(1, 2, 1):
        artlistURL = 'http://www.baiinfo.com.cn/Orders/NewsList/104?pageid=' + str(page)
        request = scrapy.http.Request(url=artlistURL,
                                      callback=self.parse_articleList)
        yield request
def parse_index(self, response):
    # Get the cookies and flatten them into a dict
    cookie_dict = {}
    from scrapy.http.cookies import CookieJar
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                cookie_dict[m] = n.value
    print(cookie_dict)
def create_request(self, url, response=None, **kwargs):
    if response is not None:
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
    else:
        cookieJar = CookieJar()
    kwargs.update(meta={'dont_merge_cookies': True, 'cookie_jar': cookieJar})
    request = Request(url, **kwargs)
    cookieJar.add_cookie_header(request)
    return request
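# A minimal sketch of how create_request() above might be chained from spider
# callbacks so each crawl keeps its own isolated jar. It assumes the
# create_request() method above is defined on this spider class; the spider
# name, URLs, and callbacks are hypothetical. Because create_request() sets
# dont_merge_cookies, the shared CookiesMiddleware stays out of the way and the
# Cookie header comes only from the per-request jar.
import scrapy


class IsolatedCookieSpider(scrapy.Spider):
    name = "isolated_cookie"  # hypothetical name

    def start_requests(self):
        # First request: a fresh jar, since there is no response to copy cookies from
        yield self.create_request("https://example.com/login-page",
                                  callback=self.parse_login)

    def parse_login(self, response):
        # Follow-up request: reuse the jar stored in response.meta; its cookies
        # are written into the Cookie header by add_cookie_header()
        yield self.create_request(
            "https://example.com/dashboard",  # placeholder URL
            response=response,
            callback=self.parse_dashboard,
        )

    def parse_dashboard(self, response):
        self.logger.info("fetched %s with per-request cookies", response.url)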
def parse(self, response):
    cookie_obj = CookieJar()
    cookie_obj.extract_cookies(response, response.request)
    self.cookie = cookie_obj._cookies  # keep the captured cookies on the spider
    yield Request(
        url="https://dig.chouti.com/login",
        method="POST",
        body="phone=8618938685515&password=avvcd123&oneMonth=1",  # the form data to submit
        headers={"Content-Type": 'application/x-www-form-urlencoded; charset=UTF-8'},  # required for this POST
        callback=self.check_login
    )
def parse(self, response):
    cookiejar = CookieJar()
    cookiejar.extract_cookies(response, response.request)
    if "i.waimai.meituan.com" not in cookiejar._cookies.keys():
        yield self.qual_pre_requests(response.meta["cookiejar"],
                                     response.meta["retry_times"] + 1)
        return None
    cookies = cookiejar._cookies["i.waimai.meituan.com"]["/"]
    cookies = {key: cookies[key].value for key in cookies}
    post_data = {"wm_poi_id": response.meta["cookiejar"]}
    yield self.contruct_request(response, post_data, cookies)
def parse_province(self, response):
    cookieJar = CookieJar()
    cookieJar.extract_cookies(response, response.request)
    self.logger.info('++++++++++++++++++++++++++%s****%d',
                     cookieJar._cookies, len(cookieJar._cookies))
    name = cookieJar._cookies['www.ipe.org.cn']['/']['ajaxkey'].name
    value = cookieJar._cookies['www.ipe.org.cn']['/']['ajaxkey'].value
    self.cookie = '{0}={1}'.format(name, value)
    self.logger.info('self.cookie=%s', self.cookie)
    for k, v in provinces.items():
        frmdata = self.get_frmdata(1, k)
        yield scrapy.FormRequest(url=self.poll_url,
                                 formdata=frmdata,
                                 callback=self.parse_page,
                                 meta={'k': k, 'v': v})
def getTarget(self, response):
    print(response.request.headers)
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    print(cookie_jar)
    cookie_dict = dict()
    cookie_list = ''
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                cookie_dict[m] = n.value
    for i, j in cookie_dict.items():
        print(i, '----------------', j)
def parse(self, response):
    print('Output:')
    print(response.meta.get('depth', 0))  # print the current depth; it starts at 0 by default
    cookie_obj = CookieJar()  # jar used to parse the cookies out
    cookie_obj.extract_cookies(response, response.request)  # a CookieJar object is needed to collect them
    # The response carries both a body and headers; the cookies are read from its headers
    '''
    Parse the cookies out of the response headers and store them in cookie_obj.
    When the form data is sent, it goes out in key=value&key=value form;
    if it were sent as JSON, it would go out as a dict instead.
    '''
def create_request(self, url, response=None, **kwargs):
    # This function could be replaced by using CookiesMiddleware instead.
    if response is not None:
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
    else:
        cookieJar = CookieJar()
    kwargs.update(meta={
        'dont_merge_cookies': True,
        'cookie_jar': cookieJar
    })
    request = Request(url, **kwargs)
    cookieJar.add_cookie_header(request)
    return request
def parse(self, response):
    cookie_obj = CookieJar()
    cookie_obj.extract_cookies(response, response.request)
    self.cookie_dict = cookie_obj._cookies
    # Send the username/password along with the captured cookies
    yield Request(url="http://dig.chouti.com/login",
                  method='POST',
                  body="phone=8615131255089&password=woshiniba&oneMonth=1",
                  headers={
                      'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8"
                  },
                  cookies=cookie_obj._cookies,
                  callback=self.check_login)
def login_callback(self, response):
    cookiejar = CookieJar()
    cookiejar.extract_cookies(response, response.request)
    self.cookiejar = cookiejar
    paths = [
        ['http://www.baiinfo.com/youse/tong', u'有色金属'],
        # ('http://www.baiinfo.com/tiehejin/tiehejin', u'铁合金')
    ]
    for path in paths:
        yield scrapy.Request(url=path[0],
                             meta={
                                 'productid': path[1],
                                 'urls': path[0]
                             },
                             callback=self.index_url)
def test_two_component_domain_ns(self):
    # Netscape: .www.bar.com, www.bar.com, .bar.com, bar.com, no domain
    # should all get accepted, as should .acme.com, acme.com and no domain
    # for 2-component domains like acme.com.
    c = CookieJar()

    # two-component V0 domain is OK
    interact_netscape(c, "http://foo.net/", 'ns=bar')
    self.assertEquals(len(c), 1)
    self.assertEquals(c._cookies["foo.net"]["/"]["ns"].value, "bar")
    self.assertEquals(interact_netscape(c, "http://foo.net/"), "ns=bar")
    # *will* be returned to any other domain (unlike RFC 2965)...
    self.assertEquals(interact_netscape(c, "http://www.foo.net/"), "ns=bar")
    # ...unless requested otherwise
    pol = DefaultCookiePolicy(
        strict_ns_domain=DefaultCookiePolicy.DomainStrictNonDomain)
    c.set_policy(pol)
    self.assertEquals(interact_netscape(c, "http://www.foo.net/"), "")

    # unlike RFC 2965, even explicit two-component domain is OK,
    # because .foo.net matches foo.net
    interact_netscape(c, "http://foo.net/foo/", 'spam1=eggs; domain=foo.net')
    # even if starts with a dot -- in NS rules, .foo.net matches foo.net!
    interact_netscape(c, "http://foo.net/foo/bar/", 'spam2=eggs; domain=.foo.net')
    self.assertEquals(len(c), 3)
    self.assertEquals(c._cookies[".foo.net"]["/foo"]["spam1"].value, "eggs")
    self.assertEquals(c._cookies[".foo.net"]["/foo/bar"]["spam2"].value, "eggs")
    self.assertEquals(interact_netscape(c, "http://foo.net/foo/bar/"),
                      "spam2=eggs; spam1=eggs; ns=bar")

    # top-level domain is too general
    interact_netscape(c, "http://foo.net/", 'nini="ni"; domain=.net')
    self.assertEquals(len(c), 3)

    ## # Netscape protocol doesn't allow non-special top level domains (such
    ## # as co.uk) in the domain attribute unless there are at least three
    ## # dots in it.
    # Oh yes it does!  Real implementations don't check this, and real
    # cookies (of course) rely on that behaviour.
    interact_netscape(c, "http://foo.co.uk", 'nasty=trick; domain=.co.uk')
    ## self.assertEquals(len(c), 2)
    self.assertEquals(len(c), 4)
def testMalformedCookieHeaderParsing(self):
    headers = Headers({'Set-Cookie': [
        'CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-2100 23:12:40 GMT',
        'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/',
        'SHIPPING=FEDEX; path=/foo',
        'COUNTRY=UY; path=/foo',
        'GOOD_CUSTOMER;',
        'NO_A_BOT;']})
    res = Response('http://www.perlmeister.com/foo', headers=headers)
    req = Request('http://www.perlmeister.com/foo')
    c = CookieJar()
    c.extract_cookies(res, req)
    c.add_cookie_header(req)
    self.assertEquals(req.headers.get('Cookie'),
                      'COUNTRY=UY; SHIPPING=FEDEX; CUSTOMER=WILE_E_COYOTE; '
                      'PART_NUMBER=ROCKET_LAUNCHER_0001; NO_A_BOT; GOOD_CUSTOMER')
def test_empty_path(self):
    # Test for empty path
    # Broken web-server ORION/1.3.38 returns to the client response like
    #
    #       Set-Cookie: JSESSIONID=ABCDERANDOM123; Path=
    #
    # ie. with Path set to nothing.
    # In this case, extract_cookies() must set cookie to / (root)
    c = CookieJar(DefaultCookiePolicy(rfc2965=True))
    headers = Headers({'Set-Cookie': 'JSESSIONID=ABCDERANDOM123; Path='})

    req = Request("http://www.ants.com/")
    res = Response("http://www.ants.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.ants.com/")
    c.add_cookie_header(req)

    self.assertEquals(req.headers.get("Cookie"), "JSESSIONID=ABCDERANDOM123")
    self.assertEquals(req.headers.get("Cookie2"), '$Version="1"')

    # missing path in the request URI
    req = Request("http://www.ants.com:8080")
    c.add_cookie_header(req)

    self.assertEquals(req.headers.get("Cookie"), "JSESSIONID=ABCDERANDOM123")
    self.assertEquals(req.headers.get("Cookie2"), '$Version="1"')
def start_requests(self):
    _driver = webdriver.PhantomJS(service_log_path='./phantomjs.log')
    _driver.set_window_size(GetSystemMetrics(0), GetSystemMetrics(1))
    self.driver = _driver
    (cookies, expires) = self.getcookies()
    if expires < time.time():
        expires = False
        print('cookie expired')
    if cookies and expires:
        self.cookiejar = CookieJar()
        for key in cookies:
            self.cookiejar.set_cookie(cookies[key])
        for url in self.start_urls:
            requset = Request(url,
                              headers=self.headers,
                              meta={'dont_merge_cookies': True, 'cookiejar': 1},
                              callback=self.parse_page)
            self.cookiejar.add_cookie_header(requset)
        return [requset]
    _driver.get("https://www.zhihu.com/#signin")
    # wait = WebDriverWait(driver, 12)  # wait
    time.sleep(8)  # wait for the page to finish loading
    _xsrf = _driver.find_element_by_xpath('//input[@name="_xsrf"]')
    _xsrf = _xsrf.get_attribute('value')
    print('_xsrf------->', _xsrf)
    input_wrapper = _driver.find_element_by_xpath('//div[@data-za-module="SignInForm"]')
    # iCaptcha = True
    # wait for the captcha to finish loading
    try:
        # input_captcha = wait.until(
        #     EC.presence_of_element_located((By.XPATH, './/div[@class="input-wrapper captcha-module"]')))
        input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="input-wrapper captcha-module"]')
    except:
        try:
            # input_captcha = wait.until(
            #     EC.presence_of_element_located((By.XPATH, './/div[@class="iCaptcha input-wrapper"]')))
            # iCaptcha = False
            input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="iCaptcha input-wrapper"]')
        except:
            input_captcha = None
    if input_captcha:
        hasShow = input_captcha.is_displayed()
    else:
        hasShow = False
    print(input_captcha, '-----captcha_url----->', hasShow)
    if hasShow:
        # A captcha is shown, so download it first.
        # TODO: this still needs work; downloading the captcha directly makes the server refresh it
        captcha_url = input_wrapper.find_element_by_xpath('.//img').get_attribute('src')
        print('captcha_url---->', captcha_url)
        _driver.close()
        return [Request(captcha_url,
                        headers=self.headers,
                        callback=self.download_captcha,
                        meta={'_xsrf': _xsrf})]
    else:
        _driver.close()
        return [self.post_login(_xsrf)]
def test_netscape_example_2(self):
    # Second Example transaction sequence:
    #
    # Assume all mappings from above have been cleared.
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo
    #
    # When client requests a URL in path "/ammo" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    #       NOTE: There are two name/value pairs named "PART_NUMBER" due to
    #       the inheritance of the "/" mapping in addition to the "/ammo" mapping.
    c = CookieJar()
    headers = Headers({'Set-Cookie': 'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/'})
    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)

    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    self.assertEquals(req.headers.get("Cookie"), "PART_NUMBER=ROCKET_LAUNCHER_0001")

    headers.appendlist("Set-Cookie", "PART_NUMBER=RIDING_ROCKET_0023; path=/ammo")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/ammo")
    c.add_cookie_header(req)

    self.assert_(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*"
                           "PART_NUMBER=ROCKET_LAUNCHER_0001",
                           req.headers.get("Cookie")))
def test_secure(self):
    for ns in True, False:
        for whitespace in " ", "":
            c = CookieJar()
            if ns:
                pol = DefaultCookiePolicy(rfc2965=False)
                int = interact_netscape
                vs = ""
            else:
                pol = DefaultCookiePolicy(rfc2965=True)
                int = interact_2965
                vs = "; Version=1"
            c.set_policy(pol)
            url = "http://www.acme.com/"
            int(c, url, "foo1=bar%s%s" % (vs, whitespace))
            int(c, url, "foo2=bar%s; secure%s" % (vs, whitespace))
            self.assert_(
                not c._cookies["www.acme.com"]["/"]["foo1"].secure,
                "non-secure cookie registered secure")
            self.assert_(
                c._cookies["www.acme.com"]["/"]["foo2"].secure,
                "secure cookie registered non-secure")
def test_session_cookies(self):
    year_plus_one = time.localtime()[0] + 1

    # Check session cookies are deleted properly by
    # CookieJar.clear_session_cookies method
    req = Request('http://www.perlmeister.com/scripts')
    headers = Headers()
    headers.appendlist("Set-Cookie", "s1=session;Path=/scripts")
    headers.appendlist("Set-Cookie",
                       "p1=perm; Domain=.perlmeister.com;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
    headers.appendlist("Set-Cookie",
                       "p2=perm;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
    headers.appendlist("Set-Cookie",
                       "s2=session;Path=/scripts;Domain=.perlmeister.com")
    headers.appendlist('Set-Cookie2', 's3=session;Version=1;Discard;Path="/"')
    res = Response('http://www.perlmeister.com/scripts', headers=headers)

    c = CookieJar()
    c.extract_cookies(res, req)
    # How many session/permanent cookies do we have?
    counter = {"session_after": 0,
               "perm_after": 0,
               "session_before": 0,
               "perm_before": 0}
    for cookie in c:
        key = "%s_before" % cookie.value
        counter[key] = counter[key] + 1
    c.clear_session_cookies()
    # How many now?
    for cookie in c:
        key = "%s_after" % cookie.value
        counter[key] = counter[key] + 1

    self.assert_(not (
        # a permanent cookie got lost accidentally
        counter["perm_after"] != counter["perm_before"] or
        # a session cookie hasn't been cleared
        counter["session_after"] != 0 or
        # we didn't have session cookies in the first place
        counter["session_before"] == 0))
def test_expires(self):
    from cookielib import time2netscape

    # if expires is in future, keep cookie...
    c = CookieJar()
    future = time2netscape(time.time() + 3600)
    interact_netscape(c, "http://www.acme.com/", 'spam="bar"; expires=%s' % future)
    self.assertEquals(len(c), 1)
    now = time2netscape(time.time() - 1)
    # ... and if in past or present, discard it
    interact_netscape(c, "http://www.acme.com/", 'foo="eggs"; expires=%s' % now)
    h = interact_netscape(c, "http://www.acme.com/")
    self.assertEquals(len(c), 1)
    self.assert_('spam="bar"' in h and "foo" not in h)

    # max-age takes precedence over expires, and zero max-age is request to
    # delete both new cookie and any old matching cookie
    interact_netscape(c, "http://www.acme.com/", 'eggs="bar"; expires=%s' % future)
    interact_netscape(c, "http://www.acme.com/", 'bar="bar"; expires=%s' % future)
    self.assertEquals(len(c), 3)
    interact_netscape(c, "http://www.acme.com/",
                      'eggs="bar"; expires=%s; max-age=0' % future)
    interact_netscape(c, "http://www.acme.com/",
                      'bar="bar"; max-age=0; expires=%s' % future)
    h = interact_netscape(c, "http://www.acme.com/")
    self.assertEquals(len(c), 1)

    # test expiry at end of session for cookies with no expires attribute
    interact_netscape(c, "http://www.rhubarb.net/", 'whum="fizz"')
    self.assertEquals(len(c), 2)
    c.clear_session_cookies()
    self.assertEquals(len(c), 1)
    self.assert_('spam="bar"' in h)
def test_netscape_misc(self):
    # Some additional Netscape cookies tests.
    c = CookieJar()
    headers = Headers()
    req = Request("http://foo.bar.acme.com/foo")

    # Netscape allows a host part that contains dots
    headers.appendlist("Set-Cookie", "Customer=WILE_E_COYOTE; domain=.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    # and that the domain is the same as the host without adding a leading
    # dot to the domain.  Should not quote even if strange chars are used
    # in the cookie value.
    headers.appendlist("Set-Cookie", "PART_NUMBER=3,4; domain=foo.bar.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://foo.bar.acme.com/foo")
    c.add_cookie_header(req)
    self.assert_(
        "PART_NUMBER=3,4" in req.headers.get("Cookie") and
        "Customer=WILE_E_COYOTE" in req.headers.get("Cookie"))
class zhihuCrawler(CrawlSpider):
    allowed_domains = ["www.zhihu.com"]
    host_url = "https://www.zhihu.com"
    start_urls = [
        "https://www.zhihu.com"
    ]
    headers = {
        'Connection': 'Keep-Alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'text / html, application / xhtml + xml, image / jxr, * / *',
        'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/'
        # ' Cookie': '_za=5852b28b-399a-4bd8-8282-59070203151f; _xsrf=7d7cdde47226ee4e485a9cc9925f2715; __utmc=51854390; q_c1=73b48dcd9e84486f81814ea556dac319|1468220250000|1468220250000; l_cap_id=NjFiN2M2YzBmYmMwNDRmODk3ZGU3NTQ0ODllMzYyYzY=|1468827275|ccd88305461b2a3f2d9c38ec5c651e1bfcba81de; cap_id=ZGQ1MjFjMzM5MGI2NDY5ZmFjMGQ5NzMxODI2M2EzNWM=|1468827275|e44ef0b232dd85e4a62077a6a67e83ccbe963692; _zap=82d8c931-4ad6-464b-8e3f-2e430cce84e0; d_c0=AIBAeHJDNgqPTo5KKrizojLF6zLSb8c38qo=|1468220251; login=ZGNlMjUwYzNjNmMxNDI0N2I1YjQyMjVlMDM3YjMwN2Y=|1468827275|1c4c6a2dd0dec9d3948653906728e6ceb22154b2; __utma=51854390.1408714905.1468575990.1468819740.1468824510.5; __utmz=51854390.1468824510.5.4.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/topic/19552832/top-answers; __utmv=51854390.000--|2=registration_date=20130613=1^3=entry_date=20160711=1; __utmb=51854390.8.10.1468824510; n_c=1'
    }
    name = 'zhihu'
    _xsrf = ''
    cookiejar = CookieJar()
    driver = None
    login_cookies = None
    login_cookies_dict = None
    # handle_httpstatus_list = [302]
    # rules = (
    #     Rule(SgmlLinkExtractor(allow=(r'/question/\d+',)), follow=True),
    #     Rule(SgmlLinkExtractor(allow=(r'/people/(\w+-?)+$',)), callback='parse_page'),
    # )

    def __init__(self):
        super(CrawlSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        Request('http://www.zhihu.com/logout', method='GET', callback=self.logout)

    def logout(self):
        print('logged out successfully')

    def start_requests(self):
        _driver = webdriver.PhantomJS(service_log_path='./phantomjs.log')
        _driver.set_window_size(GetSystemMetrics(0), GetSystemMetrics(1))
        self.driver = _driver
        (cookies, expires) = self.getcookies()
        if expires < time.time():
            expires = False
            print('cookie expired')
        if cookies and expires:
            self.cookiejar = CookieJar()
            for key in cookies:
                self.cookiejar.set_cookie(cookies[key])
            for url in self.start_urls:
                requset = Request(url,
                                  headers=self.headers,
                                  meta={'dont_merge_cookies': True, 'cookiejar': 1},
                                  callback=self.parse_page)
                self.cookiejar.add_cookie_header(requset)
            return [requset]
        _driver.get("https://www.zhihu.com/#signin")
        # wait = WebDriverWait(driver, 12)  # wait
        time.sleep(8)  # wait for the page to finish loading
        _xsrf = _driver.find_element_by_xpath('//input[@name="_xsrf"]')
        _xsrf = _xsrf.get_attribute('value')
        print('_xsrf------->', _xsrf)
        input_wrapper = _driver.find_element_by_xpath('//div[@data-za-module="SignInForm"]')
        # iCaptcha = True
        # wait for the captcha to finish loading
        try:
            # input_captcha = wait.until(
            #     EC.presence_of_element_located((By.XPATH, './/div[@class="input-wrapper captcha-module"]')))
            input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="input-wrapper captcha-module"]')
        except:
            try:
                # input_captcha = wait.until(
                #     EC.presence_of_element_located((By.XPATH, './/div[@class="iCaptcha input-wrapper"]')))
                # iCaptcha = False
                input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="iCaptcha input-wrapper"]')
            except:
                input_captcha = None
        if input_captcha:
            hasShow = input_captcha.is_displayed()
        else:
            hasShow = False
        print(input_captcha, '-----captcha_url----->', hasShow)
        if hasShow:
            # A captcha is shown, so download it first.
            # TODO: this still needs work; downloading the captcha directly makes the server refresh it
            captcha_url = input_wrapper.find_element_by_xpath('.//img').get_attribute('src')
            print('captcha_url---->', captcha_url)
            _driver.close()
            return [Request(captcha_url,
                            headers=self.headers,
                            callback=self.download_captcha,
                            meta={'_xsrf': _xsrf})]
        else:
            _driver.close()
            return [self.post_login(_xsrf)]

    def download_captcha(self, response):
        # download the captcha image
        with open('captcha.gif', 'wb') as fp:
            fp.write(response.body)
        # open the captcha image with the default viewer
        os.system('start captcha.gif')
        # enter the captcha by hand
        print('Please enter captcha: ')
        captcha = input()
        return self.post_login(response.meta['_xsrf'], captcha)

    def post_login(self, _xsrf, captcha=None):
        formdata = {'_xsrf': _xsrf,
                    'password': projectsetting.PASS_WORD,  # your password
                    'captcha_type': 'cn',
                    'remember_me': 'true',
                    'email': projectsetting.USER_NAME}  # your account
        if captcha != None:
            formdata['captcha'] = captcha
        return FormRequest("https://www.zhihu.com/login/email",
                           method='POST',
                           headers=self.headers,
                           callback=self.login_result,
                           meta={'dont_merge_cookies': True},
                           formdata=formdata)

    def login_result(self, response):
        body = json.loads(response.body.decode('utf-8'))
        print('content---->', body)
        if body.get('r') != 0:
            return
        self.cookiejar = response.meta.setdefault('cookiejar', CookieJar())
        self.cookiejar.extract_cookies(response, response.request)
        self.savecookies(self.cookiejar._cookies)
        for url in self.start_urls:
            requset = Request(url,
                              headers=self.headers,
                              meta={'dont_merge_cookies': True, 'cookiejar': 1},
                              callback=self.parse_page)
            yield requset

    def savecookies(self, cookies):
        copyCookie = dict()
        with open('login_cookie.json', 'w') as cookiesfile:
            def convterall(cookies):
                for key in cookies.keys():
                    value = cookies.get(key)
                    if isinstance(value, Cookie):
                        copyCookie[key] = self.class2str(value)
                    elif isinstance(value, dict):
                        convterall(value)
            convterall(cookies)
            self.login_cookies_dict = copyCookie
            cookiesfile.write(json.dumps(copyCookie))

    def class2str(self, dictdata):
        dic = {}
        dic.update(dictdata.__dict__)
        return dic

    def dict2cookie(self, cookie_dict):
        result = {}
        for item in cookie_dict.items():
            param = ''
            for key in item[1]:
                value = item[1][key]
                if type(value) == str:
                    value = "'" + value + "'"
                if key[0] == '_':
                    key = key[1:]
                param += '{0}={1},'.format(key, value)
            param = param[0:-1]
            evalstr = 'Cookie({0})'.format(param)
            result[item[0]] = eval(evalstr)
        return result

    def getcookies(self):
        expires = 0
        if self.login_cookies:
            for key in self.login_cookies:
                expires = self.login_cookies[key].expires
                break
            return (self.login_cookies, expires)
        if not os.path.exists('login_cookie.json'):
            return (None, 0)
        with open('login_cookie.json', encoding='utf-8') as cookiesfile:
            cookiesstr = cookiesfile.read()
            if cookiesstr == '' or cookiesstr == None:
                return (None, 0)
            cookies = json.loads(cookiesstr)
            self.login_cookies_dict = cookies
            self.login_cookies = self.dict2cookie(cookies)
            expires = 0
            if self.login_cookies:
                for key in self.login_cookies:
                    expires = self.login_cookies[key].expires
                    if expires != None:
                        break
        return (self.login_cookies, expires)

    def parse_page(self, response):
        with open('users.json', 'w') as user:
            user.write('')
        sel = Selector(response)
        href = sel.xpath('//ul[@id="top-nav-profile-dropdown"]/li[1]/a/@href').extract()[0]
        print('href----->', href)
        cookiejar = response.meta['cookiejar']
        request = Request(self.host_url + href,
                          headers=self.headers,
                          meta={'cookiejar': cookiejar},
                          callback=self.people_page)
        return request

    def people_page(self, response):
        yield self.parse_item(response)
        sel = Selector(response)
        # followees and followers
        following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')
        # TODO: recursively collect the follow data of all valid users
        followings = following.xpath('.//a/@href').extract()
        for follow_link in followings:
            # yield self.cookiejar_addcookies(response, url=follow_link, callback=self.followees_page)
            # calling it that way triggers a redirect that has not been solved yet
            self.webdriver_addcookies(follow_link)
            browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
            while True:
                # do the scrolling
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)  # wait for the data to finish loading
                scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
                if browerHeight == scrollHeight:
                    break
                browerHeight = scrollHeight
            peoplelinks = self.driver.find_elements_by_xpath('//a[@class="zm-item-link-avatar"]')
            for link in peoplelinks:
                href = link.get_attribute('href')
                # some user links cannot be found here; still to be investigated
                yield self.cookiejar_addcookies(response, url=href, callback=self.people_page)
        # followees = followings[0]  # links of people this user follows
        # followers = followings[1]  # links of followers

    def webdriver_addcookies(self, url):
        for key in self.login_cookies_dict:
            cookie = self.login_cookies_dict[key]
            self.driver.add_cookie({k: cookie[k] for k in ['name', 'value', 'domain', 'path']})
        if url.find('http://') > -1 or url.find('https://') > -1:
            pass
        else:
            url = self.host_url + url
        self.driver.get(url)

    def cookiejar_addcookies(self, response, url, callback):
        cookiejar = response.meta['cookiejar']
        if url.find('http://') > -1 or url.find('https://') > -1:
            pass
        else:
            url = self.host_url + url
        request = Request(url,
                          headers=self.headers,
                          dont_filter=True,
                          meta={'cookiejar': cookiejar,
                                'dont_redirect': True,
                                'handle_httpstatus_list': [302]},
                          callback=callback)
        # cookiejar.add_cookie_header(request)
        return request

    def followees_page(self, response):
        if response.status in (302,) and 'Location' in response.headers:
            url = unquote(response.headers['Location'].decode('utf-8'))
            self.logger.debug("(followees_page) Location header: %r" % response.urljoin(url))
            yield self.cookiejar_addcookies(response, response.urljoin(url), self.followees_page)
        sel = Selector(response)
        peoplelinks = sel.xpath('//a[@class="zm-item-link-avatar"]/@href').extract()
        for link in peoplelinks:
            yield self.cookiejar_addcookies(response, url=link, callback=self.people_page)

    def parse_item(self, response):
        sel = Selector(response)
        following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')
        followees_followers = following.xpath('.//strong/text()').extract()
        count = 0
        for follow in followees_followers:
            count += int(follow)
        if count == 0:
            print('This is an inactive (zombie) account:',
                  response.url.replace(self.host_url + '/people/', ''))
            return
        topics_link = sel.xpath('//a[@class="zg-link-litblue"]/@href').extract()
        for topics in topics_link:
            if topics.find('topics') > -1:
                topics_link = topics
        print('topics->>>>>>>>>>>', topics_link)
        # open the followed-topics page
        self.webdriver_addcookies(topics_link)
        browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
        while True:
            # do the scrolling
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # wait for the data to finish loading
            scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
            if browerHeight == scrollHeight:
                break
            browerHeight = scrollHeight
        topic_list = self.driver.find_element_by_id('zh-profile-topic-list')
        item = ZhihuItem()
        item['name'] = self.driver.find_element_by_xpath('//a[@class="name"]').text
        try:
            item['business'] = self.driver.find_element_by_xpath('//span[@class="business item"]').get_attribute('title')
        except:
            item['business'] = ''
        try:
            item['location'] = self.driver.find_element_by_xpath('//span[@class="location item"]').text
        except:
            item['location'] = ''
        topics = []
        topic_divs = topic_list.find_elements_by_xpath('./div')
        for topic in topic_divs:
            section = topic.find_element_by_xpath('./div[@class="zm-profile-section-main"]')
            links = section.find_elements_by_tag_name('a')
            topicdata = links[1]
            topic_id = os.path.basename(topicdata.get_attribute('href'))
            topic_name = topicdata.find_element_by_tag_name('strong').text
            topic_answers = int(links.pop().text.replace(' 个回答', ''))  # strip the "answers" suffix from the page text
            topics.append({'topic_id': topic_id,
                           'topic_name': topic_name,
                           'topic_answers': topic_answers})
        item['topics'] = topics
        # Temporarily write to a file for easy inspection; with the item pipeline
        # the data can only be viewed after everything is written, and the volume is large
        # with codecs.open('users.json', 'a', encoding='utf-8') as user:
        #     line = json.dumps(dict(item)) + ','
        #     user.write(line.encode('latin-1').decode('unicode_escape'))
        return item
def test_domain_allow(self):
    c = CookieJar(policy=DefaultCookiePolicy(
        blocked_domains=["acme.com"],
        allowed_domains=["www.acme.com"]))

    req = Request("http://acme.com/")
    headers = {"Set-Cookie": "CUSTOMER=WILE_E_COYOTE; path=/"}
    res = Response("http://acme.com/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEquals(len(c), 0)

    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEquals(len(c), 1)

    req = Request("http://www.coyote.com/")
    res = Response("http://www.coyote.com/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEquals(len(c), 1)

    # set a cookie with non-allowed domain...
    req = Request("http://www.coyote.com/")
    res = Response("http://www.coyote.com/", headers=headers)
    cookies = c.make_cookies(res, req)
    c.set_cookie(cookies[0])
    self.assertEquals(len(c), 2)
    # ... and check it doesn't get returned
    c.add_cookie_header(req)
    assert 'Cookie' not in req.headers
def test_domain_block(self):
    pol = DefaultCookiePolicy(
        rfc2965=True, blocked_domains=[".acme.com"])
    c = CookieJar(policy=pol)
    headers = {'Set-Cookie': 'CUSTOMER=WILE_E_COYOTE; path=/'}

    req = Request("http://www.acme.com/")
    res = Response('http://www.acme.com/', headers=headers)
    c.extract_cookies(res, req)
    self.assertEquals(len(c), 0)

    p = pol.set_blocked_domains(["acme.com"])
    c.extract_cookies(res, req)
    self.assertEquals(len(c), 1)

    c.clear()
    req = Request("http://www.roadrunner.net/")
    res = Response("http://www.roadrunner.net/", headers=headers)
    c.extract_cookies(res, req)
    self.assertEquals(len(c), 1)
    req = Request("http://www.roadrunner.net/")
    c.add_cookie_header(req)
    assert 'Cookie' in req.headers and 'Cookie2' in req.headers

    c.clear()
    pol.set_blocked_domains([".acme.com"])
    c.extract_cookies(res, req)
    self.assertEquals(len(c), 1)

    # set a cookie with blocked domain...
    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)
    cookies = c.make_cookies(res, req)
    c.set_cookie(cookies[0])
    self.assertEquals(len(c), 2)
    # ... and check it doesn't get returned
    c.add_cookie_header(req)
    assert 'Cookie' not in req.headers
def cookiejar_from_cookie_headers(headers):
    c = CookieJar()
    req = Request("http://www.example.com/")
    r = Response("http://www.example.com/", headers=headers)
    c.extract_cookies(r, req)
    return c
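# A small usage sketch for cookiejar_from_cookie_headers() above; the
# Set-Cookie value is made up purely for illustration, and the plain dict is
# accepted as headers the same way the test snippets above use it.
headers = {'Set-Cookie': 'SESSIONID=abc123; Path=/'}
jar = cookiejar_from_cookie_headers(headers)
for cookie in jar:
    print(cookie.name, cookie.value)  # SESSIONID abc123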
def test_netscape_example_1(self):
    #-------------------------------------------------------------------
    # First we check that it works for the original example at
    # http://www.netscape.com/newsref/std/cookie_spec.html

    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE
    #
    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: SHIPPING=FEDEX; path=/fo
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # When client requests a URL in path "/foo" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX
    #
    # The last Cookie is buggy, because both specifications say that the
    # most specific cookie must be sent first.  SHIPPING=FEDEX is the
    # most specific and should thus be first.

    year_plus_one = time.localtime()[0] + 1

    c = CookieJar(DefaultCookiePolicy(rfc2965=True))

    #req = Request("http://1.1.1.1/",
    #              headers={"Host": "www.acme.com:80"})
    req = Request("http://www.acme.com:80/",
                  headers={"Host": "www.acme.com:80"})

    headers = Headers()
    headers['Set-Cookie'] = ('CUSTOMER=WILE_E_COYOTE; path=/ ; '
                             'expires=Wednesday, 09-Nov-%d 23:12:40 GMT' % year_plus_one)
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    self.assertEqual(req.headers.get("Cookie"), "CUSTOMER=WILE_E_COYOTE")
    self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

    headers.appendlist("Set-Cookie", "PART_NUMBER=ROCKET_LAUNCHER_0001; path=/")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/foo/bar")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assert_("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                 "CUSTOMER=WILE_E_COYOTE" in h)

    headers.appendlist('Set-Cookie', 'SHIPPING=FEDEX; path=/foo')
    res = Response("http://www.acme.com", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assert_("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                 "CUSTOMER=WILE_E_COYOTE" in h and
                 "SHIPPING=FEDEX" not in h)

    req = Request("http://www.acme.com/foo/")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assert_(("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                  "CUSTOMER=WILE_E_COYOTE" in h and
                  h.startswith("SHIPPING=FEDEX;")))