def create_request(self, url, response=None, **kwargs):
    if response is not None:
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
    else:
        cookieJar = CookieJar()
    kwargs.update(meta={'dont_merge_cookies': True, 'cookie_jar': cookieJar})
    request = Request(url, **kwargs)
    cookieJar.add_cookie_header(request)
    return request
def create_request(self, url, response=None, **kwargs):
    # This function could be replaced by using CookiesMiddleware instead.
    if response is not None:
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
    else:
        cookieJar = CookieJar()
    kwargs.update(meta={
        'dont_merge_cookies': True,
        'cookie_jar': cookieJar
    })
    request = Request(url, **kwargs)
    cookieJar.add_cookie_header(request)
    return request
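# A minimal usage sketch for the create_request helper above (spider name,
# URLs, and the parse_account callback are hypothetical), assuming the helper
# is defined on the spider class: the first call builds a fresh jar, and each
# later call re-threads the jar stored in response.meta['cookie_jar'] so the
# session's cookies follow along.
class SessionSpider(scrapy.Spider):
    name = 'session_example'

    def start_requests(self):
        yield self.create_request('https://example.com/login')

    def parse(self, response):
        # cookies set by the login response are extracted into the jar and
        # re-attached as a Cookie header on the next request
        yield self.create_request('https://example.com/account',
                                  response=response,
                                  callback=self.parse_account)

    def parse_account(self, response):
        self.logger.info('fetched %s with session cookies', response.url)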
def parse(self, response):
    sel = Selector(response)
    follow_button = '//div[@class="follow-button"]/@props-data-collection-id'
    notebook_button = '//div[@class="follow-button"]/@props-data-notebook-id'
    script_collection = '//script[@data-name="collection"]/text()'
    cid = (sel.xpath(follow_button).extract_first()
           or sel.xpath(notebook_button).extract_first()
           or sel.xpath(script_collection).re_first(r'"id":(\d+)'))
    if not cid:
        raise ValueError('no collection articles, collection id is None.')
    while not self.done:
        cookie_jar = response.meta.setdefault('cookiejar', CookieJar())
        cookie_jar.extract_cookies(response, response.request)
        if "/c/" in response.url:
            collection_url = BaseHelper.get_collection_articles_url(
                cid, self.page, self.count)
        elif "/nb/" in response.url:
            collection_url = BaseHelper.get_notebooks_articles_url(
                cid, self.page, self.count)
        request = Request(collection_url,
                          headers=BaseHelper.get_headers_json(),
                          callback=self.parse_collection)
        cookie_jar.add_cookie_header(request)  # apply Set-Cookie ourselves
        request.meta['cookiejar'] = cookie_jar
        yield request
        self.page += 1
def start_requests(self):
    yield Request(
        url=self.url,
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Host': 'item.jd.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) '
                          'Gecko/20100101 Firefox/52.0',
        },
        method='GET',
        meta={
            'dont_merge_cookies': True,
            'cookiejar': CookieJar(),
        },
        dont_filter=True,
        callback=self.get_comment_count)
def get_cookies_dict_from_response(response):
    jar = CookieJar()
    jar.extract_cookies(response, response.request)
    cookie_objs = jar.make_cookies(response, response.request)
    cookies = {cookie.name: cookie.value for cookie in cookie_objs}
    return cookies
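# A hedged usage sketch for the helper above: call it from any callback to get
# a plain name -> value dict of the cookies the response just set, suitable for
# a follow-up Request's `cookies=` argument (URL and callback are hypothetical).
def parse_login(self, response):
    cookies = get_cookies_dict_from_response(response)
    yield Request('https://example.com/profile',
                  cookies=cookies,
                  callback=self.parse_profile)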
def parse(self, response):
    """
    Log in to Chouti.
    :param response:
    :return:
    """
    # Create a cookie jar object
    cookie_obj = CookieJar()
    # Extract the cookies from the response
    cookie_obj.extract_cookies(response, response.request)
    # Store the cookies in a member variable
    self.cookie = cookie_obj._cookies
    # Issue the login request
    yield Request(
        url="https://dig.chouti.com/login",
        method="POST",
        headers={
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        },
        # The request body here cannot be in dict format
        body="phone=886918207171&password=jamie851230&oneMonth=1",
        cookies=cookie_obj._cookies,
        callback=self.check_login,
    )
def check_login(self, response):
    # Check the server response to determine whether login succeeded
    text_json = json.loads(response.text)
    if not ('msg' in text_json and text_json['msg'] == '登录成功'):  # server's "login succeeded" message
        print('login failed')
        cookie_jar = CookieJar()
        # yield is used here rather than return [...] because this function
        # contains yield, so it is a generator; since PEP 479, raising
        # StopIteration inside a generator is a RuntimeError, so a bare
        # `return` is used to stop iteration instead.
        yield scrapy.Request('https://www.zhihu.com/#signin',
                             callback=self.get_captcha,
                             meta={'cookie': cookie_jar},
                             dont_filter=True)
        return
    print('login succeeded')
    # Login succeeded: extract the cookies and save them locally
    cookies = response.meta['cookie']
    cookies.extract_cookies(response, response.request)
    with open('cookies.txt', 'w') as f:
        for cookie in cookies:
            f.write(str(cookie) + '\n')
    for url in self.start_urls:
        # dont_filter tells the scheduler not to dedupe this request,
        # so the same request can be issued more than once
        yield scrapy.Request(url, dont_filter=True)
def parse(self, response):
    # response.text holds the full front-page HTML
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    # Walk the nested jar and flatten the cookies into a dict
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    post_dict = {
        'phone': '8615915455813',
        'password': '******',
        'oneMonth': 1,
    }
    # Send a POST request to log in
    yield Request(
        url='https://dig.chouti.com/login',
        method='POST',
        cookies=self.cookie_dict,
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        },
        body=urllib.parse.urlencode(post_dict),
        callback=self.parse2  # run the callback on success
    )
def parse1(self, response):
    # response.text holds the full front-page HTML
    from scrapy.http.cookies import CookieJar
    cookie_jar = CookieJar()  # the jar object wraps the cookies
    cookie_jar.extract_cookies(response, response.request)  # pull the cookies out of the response
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    post_dict = {
        'phone': '8615131255089',
        'password': '******',
        'oneMonth': 1,
    }
    import urllib.parse
    # Goal: send a POST request to log in
    yield Request(url="http://dig.chouti.com/login",
                  method='POST',
                  cookies=self.cookie_dict,
                  body=urllib.parse.urlencode(post_dict),
                  headers={
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
                  },
                  callback=self.parse2)
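# The triple-nested loop above walks the private CookieJar._cookies mapping,
# which is keyed by domain, then path, then cookie name. Since scrapy's
# CookieJar is iterable over its http.cookiejar.Cookie objects (as later
# snippets here rely on), the same flattening can be written without touching
# private state; a small helper sketch:
def cookiejar_to_dict(cookie_jar):
    # one {name: value} entry per cookie, regardless of domain/path
    return {cookie.name: cookie.value for cookie in cookie_jar}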
class LevelSubjects(scrapy.Spider):
    name = "level_subjects"
    # Instantiate a cookie jar object
    cookie_jar = CookieJar()

    def start_requests(self):
        # Crawl the pages below from this method
        cookie = get_valid_cookie()
        if cookie:
            yield scrapy.Request(url=LEVEL_SUBJECTS,
                                 callback=self.parse,
                                 cookies=cookie)
        else:
            pass  # TODO: handle the missing-cookie case

    def parse(self, response):
        sub_obj = response.css('.tip-pop').xpath('./dl')
        for item in sub_obj:
            level_name = item.xpath('./dt/text()').get()
            for sub_a in item.xpath('.//a'):
                ls_item = LevelSubjectsItem()
                # Teaching level name, e.g. primary school
                ls_item['level_name'] = level_name
                # Subject name
                ls_item['subject_name'] = sub_a.xpath('./text()').get()
                # Subject URL
                ls_item['search_url'] = sub_a.xpath('./@href').get()
                # Subject code
                ls_item['subject_code'] = ls_item['search_url'][1:].split('/')[0]
                # Teaching level code
                ls_item['level_code'] = ls_item['search_url'][1:].split('/')[0]
                yield ls_item
def login(self, response):
    from scrapy.http.cookies import CookieJar
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    post_dict = {
        "phone": "8618001999999",
        "password": "******",
        "oneMonth": 1,
    }
    import urllib.parse
    yield Request(
        url="http://dig.chouti.com/login",
        method='POST',
        cookies=self.cookie_dict,
        body=urllib.parse.urlencode(post_dict),
        headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
        callback=self.parse1
    )
def parse1(self, response):
    """Log in from the front page."""
    # response.text holds the full front-page HTML
    from scrapy.http.cookies import CookieJar
    cookie_jar = CookieJar()
    # extract_cookies() returns None, so keep the jar itself and flatten it;
    # Request's `cookies` argument expects a dict, not a CookieJar
    cookie_jar.extract_cookies(response, response.request)  # pull the cookies out of the response
    self.cookie_jar = {cookie.name: cookie.value for cookie in cookie_jar}
    post_dict = {
        'phone': '8617748232617',
        'password': '******',
        'oneMonth': 1,
    }
    import urllib.parse
    # urlencode converts the dict into the form phone=86123&password=123&oneMonth=1
    data = urllib.parse.urlencode(post_dict)
    # Send the POST request to log in
    yield Request(url='http://dig.chouti.com/login',
                  method='POST',
                  cookies=self.cookie_jar,
                  body=data,
                  headers={
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
                  },
                  callback=self.parse2)
def login(self, response):
    '''Send an AJAX-style POST request to log in.'''
    # Pull the cookie information out of the response
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    # Flatten the jar into the dict passed as `cookies` below
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    login_req = Request(
        url='http://dig.chouti.com/login',
        method='POST',
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        },
        # the form body uses `=`, not `:`, between key and value
        body='phone=8618922795525&password=woaiwojia89&oneMonth=1',
        cookies=self.cookie_dict,
        callback=self.check_login,
    )
    print(self.cookie_dict)
    print('ran login')
    yield login_req
def start_requests(self):
    # Read the saved cookie reprs and parse them back into name/value pairs
    with open(self.cookie_file) as f:
        raw = f.read()
    p = re.compile(r'<Cookie (.*?) for \.zhihu\.com/>')
    cookies = re.findall(p, raw)
    mcookies = []
    for cookie in cookies:
        parts = cookie.split('=')
        # the value may itself contain '=', so rejoin everything after the name
        mcookies.append((parts[0], '='.join(parts[1:])))
    COOKIE = dict(mcookies)
    cookie_jar = CookieJar()
    if COOKIE:
        print('login with cookie!')
        return [Request('https://www.zhihu.com',
                        meta={'cookiejar': cookie_jar},
                        headers=HEADER,
                        cookies=COOKIE,
                        callback=self.after_login)]
    else:
        print('login with password!')
        return [Request("https://www.zhihu.com/#signin",
                        meta={'dont_merge_cookies': True, 'cookiejar': cookie_jar},
                        headers=HEADER,
                        callback=self.post_login)]
def parse(self, response):
    """
    Handle the content returned by the first visit to Chouti.
    :param response:
    :return:
    """
    # Extract the cookies from the response headers into a CookieJar object
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    # Flatten the jar into a plain dict
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    yield Request(
        url='https://dig.chouti.com/login',
        method='POST',
        body="phone=8613121758648&password=woshiniba&oneMonth=1",
        # body=urlencode({...}) would produce the same form-encoded string
        cookies=self.cookie_dict,
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        },
        callback=self.check_login)
def start_crawl(self, response):
    cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
    cookieJar.extract_cookies(response, response.request)
    for url in self.start_urls:
        ZIP_CODE = 33131
        TIME = str(int(time.time() * 1000))
        title, product_id, sku_id, status, price = self.get_detail(url.rstrip())
        if all(v is None for v in [title, product_id, sku_id, status, price]):
            continue
        get_shipping_url = (
            f"https://www.samsclub.com/sams/shop/product/moneybox/shippingDeliveryInfo.jsp"
            f"?zipCode={ZIP_CODE}&productId={product_id}&skuId={sku_id}"
            f"&status={status}&isSelectedZip=true&isLoggedIn=true&_={TIME}")
        item = {
            "title": title,
            "status": status,
            "price": price,
            "product_id": product_id,
            "sku_id": sku_id
        }
        request = scrapy.Request(
            get_shipping_url,
            meta={"item": item, "cookie_jar": cookieJar},
            callback=self.parse)
        cookieJar.add_cookie_header(request)
        yield request
def parse(self, response):
    cookie_obj = CookieJar()
    cookie_obj.extract_cookies(response, response.request)
    # print(cookie_obj._cookies)  # debug: inspect the extracted cookies
    page = response.meta['page']
    next_page = page + 1
    logging.info('on parse')
    logging.info(f'next page ========== {next_page}')
    articles = response.xpath('//article[@class="excerpt"]')
    for article in articles:
        item = AsyncSandboxItem()
        category = article.xpath('./header/a[1]/text()').extract_first()
        title = article.xpath('./header/h2/a[1]/text()').extract_first()
        article_url = article.xpath('./header/h2/a[1]/@href').extract_first()
        item['title'] = title
        item['category'] = category
        item['article_url'] = article_url
        yield Request(url=article_url,
                      callback=self.parse_item,
                      meta={'item': item})
    if next_page < 900:
        yield Request(
            url=self.BASE_URL.format(next_page),
            meta={'page': next_page},
            # dont_filter=True
        )
def parse(self, response):
    login_url = 'https://passport.36kr.com/passport/sign_in'
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for m, n in cookie_jar._cookies.items():
        for m1, n1 in n.items():
            for m2, n2 in n1.items():
                if m2 in [
                        'aliyungf_tc', 'krnewsfrontss', 'device-uid',
                        'M-XSRF-TOKEN'
                ]:
                    self.cookie_jar[m2] = n2.value
    """
    type:login
    bind:false
    needCaptcha:false
    username:18616561846
    password:abcd.1234
    ok_url:https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE
    ktm_reghost:null
    """
    yield scrapy.Request(
        url=login_url,
        method='POST',
        # implicit string concatenation avoids the stray whitespace that a
        # backslash continuation inside the literal would inject into the body
        body=('type=login&bind=false&needCaptcha=false'
              '&username=18616561846&password=abcd.1234'
              '&ok_url=https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE'
              '&ktm_reghost=null'),
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        cookies=self.cookie_jar,
        callback=self.login)
class MailSpider(scrapy.Spider):
    name = 'mail'
    # allowed_domains = ['mail.163.com']
    # start_urls = ['https://mail.163.com/']
    cookie_jar = CookieJar()

    def start_requests(self):
        urls = [
            'https://dl.reg.163.com/webzj/m163_2/pub/index_dl.html?wdaId=&pkid=CvViHzl&product=mail163',
            'https://dl.reg.163.com/getConf?callback=URSJSONP1524651825971&pkid=CvViHzl&pd=mail163&mode=1',
            'https://dl.reg.163.com/ini?pd=mail163&pkid=CvViHzl&pkht=mail.163.com&topURL=https%3A%2F%2Fmail.163.com%2F&nocache=1524651826394',
        ]
        for url in urls:
            yield scrapy.Request(url=url, meta={'cookiejar': 1}, callback=self.parse)

    def parse(self, response):
        if response.url == ('https://dl.reg.163.com/ini?pd=mail163&pkid=CvViHzl'
                            '&pkht=mail.163.com&topURL=https%3A%2F%2Fmail.163.com%2F'
                            '&nocache=1524651826394'):
            yield scrapy.Request(url='https://mail.163.com/',
                                 meta={'cookiejar': 1},
                                 callback=self.parse2)

    def parse2(self, response):
        pass
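# The `meta={'cookiejar': 1}` above is the built-in CookiesMiddleware support
# for keeping multiple cookie sessions per spider: the value is just a session
# key, not a jar object. A minimal sketch (hypothetical URLs and a hypothetical
# parse_next callback) running two independent sessions side by side:
def start_requests(self):
    for i, url in enumerate(['https://example.com/a', 'https://example.com/b']):
        yield scrapy.Request(url, meta={'cookiejar': i}, callback=self.parse)

def parse(self, response):
    # passing the same key along keeps the follow-up in that session
    yield scrapy.Request('https://example.com/next',
                         meta={'cookiejar': response.meta['cookiejar']},
                         callback=self.parse_next)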
def start_requests(self):
    cookie_jar = CookieJar()
    yield FormRequest(self.url0,
                      formdata=self.data0,
                      headers=self.header,
                      meta={'cookiejar': cookie_jar},
                      callback=self.parse)
class BaseRegistroCivilSpider(scrapy.Spider):
    cookie_jar = CookieJar()
    login_url = "https://transparencia.registrocivil.org.br/registral-covid"
    start_urls = []
    xsrf_token = ""

    def start_requests(self):
        yield self.make_login_request()

    def make_login_request(self):
        return scrapy.Request(
            url=self.login_url,
            callback=self.parse_login_response,
            meta={"dont_cache": True},
        )

    def make_request(self, *args, **kwargs):
        kwargs["headers"] = kwargs.get("headers", {})
        kwargs["headers"]["X-XSRF-TOKEN"] = self.xsrf_token
        return scrapy.Request(*args, **kwargs)

    def start_requests_after_login(self):
        for url in self.start_urls:
            yield self.make_request(url, callback=self.parse)

    def parse_login_response(self, response):
        self.cookie_jar.extract_cookies(response, response.request)
        self.xsrf_token = next(c for c in self.cookie_jar
                               if c.name == "XSRF-TOKEN").value
        for request in self.start_requests_after_login():
            yield request

    def parse(self, response):
        # scrapy callbacks receive the response, so the abstract hook must accept it
        raise NotImplementedError()
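# Concrete spiders would subclass the base above, supplying start_urls and a
# parse() implementation; a minimal hypothetical subclass sketch (the endpoint
# URL is assumed, not taken from the project):
class RegistralCovidSpider(BaseRegistroCivilSpider):
    name = 'registral_covid'
    start_urls = ['https://transparencia.registrocivil.org.br/api/covid']

    def parse(self, response):
        # every request built via make_request carries the X-XSRF-TOKEN header
        yield {'url': response.url, 'length': len(response.body)}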
def login(self, response):
    cookie_jar = CookieJar()
    # Extract the not-yet-authenticated cookies before logging in
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    data = {
        "phone": "8613476152416",
        "password": "******",
        "oneMonth": 1,
    }
    # urlencode(data) converts the dict to form format: phone=...&password=...&oneMonth=1
    req = Request(url='https://dig.chouti.com/login',
                  method='POST',
                  headers={
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
                  },
                  body=urlencode(data),
                  cookies=self.cookie_dict,
                  callback=self.check_login)
    yield req
def __init__(self, start_url=None, history=True):
    super(Sdjnggzyjy, self).__init__()
    # jobs = start_url.split('|')
    jobs = self.start_urls
    self.cookie_jar = CookieJar()
    self.count = 0
    for job in jobs:
        self.headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # 'Accept-Encoding': 'gzip,deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'Content-Length': '90',
            'Content-Type': 'application/json',
            # 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/64.0',
            'Host': 'www.jnggzyjy.gov.cn',
            'Referer': job.split()[0],
            'Public-X-XSRF-TOKEN': '',
            # 'X-Requested-With': 'XMLHttpRequest',
        }
        self.post_params.append({
            "url": job.split()[0],
            "ba_type": job.split()[1]
        })
    # dispatcher.connect(self.initial, signals.engine_started)
    self.history = history
def make_cookie_reqs(self, url, payload, xss_param):
    '''Generate payloaded cookie header requests'''
    # string.lowercase is Python 2 only; string.ascii_lowercase works on Python 3
    two_rand_letters = (random.choice(string.ascii_lowercase) +
                        random.choice(string.ascii_lowercase))
    delim_str = self.delim + two_rand_letters
    payload = delim_str + payload + delim_str + ';9'
    reqs = [
        Request(url,
                meta={
                    'xss_place': 'header',
                    'cookiejar': CookieJar(),
                    'xss_param': xss_param,
                    'orig_url': url,
                    'payload': payload,
                    'delim': delim_str
                },
                cookies={'userinput': payload},
                callback=self.xss_chars_finder,
                dont_filter=True)
    ]
    if len(reqs) > 0:
        return reqs
def scrape_library(self, response):
    profile_url = response.meta['profile_url']
    hxs = HtmlXPathSelector(response)
    for item in hxs.select(
            '//div[@id="content"]/table[@id="lt_catalog_list"]/tbody/tr'):
        book = LibraryThingLibraryItem()
        book['profile_url'] = profile_url
        cells = item.select('td')
        book['work_url'] = ('http://www.librarything.com%s'
                            % cells[1].select('.//a/@href').extract()[0])
        book['work_id'] = book['work_url'].split('/')[-3]
        book['rating'] = cells[5].select('.//input/@value').extract()[0]
        book['date_added'] = str(dateutil.parser.parse(
            cells[6].select('.//text()').extract()[0]).date())
        yield book
    next_page = hxs.select(
        '//div[@id="content"]/table//nobr/a[contains(text(), "next page")]')
    if next_page:
        cookie_jar = response.meta.setdefault('cookie_jar', CookieJar())
        cookie_jar.extract_cookies(response, response.request)
        request = Request('http://www.librarything.com%s'
                          % next_page.select('@href').extract()[0],
                          meta={'profile_url': profile_url,
                                'dont_merge_cookies': True,
                                'cookie_jar': cookie_jar},
                          callback=self.scrape_library)
        cookie_jar.add_cookie_header(request)
        yield request
def login_callback(self, response):
    cookiejar = CookieJar()
    cookiejar.extract_cookies(response, response.request)
    self.cookiejar = cookiejar
    request = scrapy.http.Request(
        url='http://www.baiinfo.com/Search/Index?wd=%E6%95%A3%E8%A3%85%E8%BF%9B%E5%8F%A3%E6%B2%A5%E9%9D%92%E5%88%B0%E5%B2%B8%E4%BB%B7',
        callback=self.parse_index)
    return [request]
def get_cookies(self, res):
    '''Save the cookies obtained after a successful login.'''
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(res, res.request)
    # The cookie jar is dict-like; write each cookie out to a file
    with open('cookies.txt', 'w') as f:
        for cookie in cookie_jar:
            f.write(str(cookie) + '\n')
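# str(cookie) above writes a human-readable repr that is awkward to parse back
# (cf. the regex-based reload in an earlier snippet). If the cookies need to be
# restored later, pickling the jar round-trips cleanly, as the scrapy-cookies
# snippets at the end of this collection also do; a minimal sketch with a
# hypothetical file name:
import pickle

def save_cookie_jar(cookie_jar, path='cookies.pkl'):
    with open(path, 'wb') as f:
        pickle.dump(cookie_jar, f)

def load_cookie_jar(path='cookies.pkl'):
    with open(path, 'rb') as f:
        return pickle.load(f)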
def start_requests(self):
    for url in self.start_urls:
        # The site apparently cannot be accessed without a cookie
        cookie_jar = CookieJar()
        yield Request(url,
                      meta={'cookiejar': cookie_jar},
                      headers=self.head,
                      callback=self.login_in)
def test_setitem(self):
    cookies = CookieJar()
    self.storage['new_cookies'] = cookies
    self.assertDictEqual(
        self.storage.coll.find_one({'key': 'new_cookies'}, {'_id': 0}),
        {
            'key': 'new_cookies',
            'cookiejar': pickle.dumps(cookies),
            'cookies': cookies._cookies
        })
def extract_to_file(session=context.http_session):
    scrapy_jar = CookieJar(policy=session.cookies.get_policy())
    for cookie in session.cookies:
        scrapy_jar.set_cookie(cookie)
    with open(cookie_name(), 'wb') as io_writer:
        # save it in a way scrapy_cookies can gather later on
        pickle.dump({None: scrapy_jar}, io_writer)
    os.chmod(cookie_name(), mode=0o600)
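# A companion sketch for reading the file written above back into the requests
# session (the inverse of extract_to_file, assuming the same {None: jar} layout
# and the cookie_name()/context.http_session objects from that snippet):
def restore_from_file(session=context.http_session):
    with open(cookie_name(), 'rb') as io_reader:
        scrapy_jar = pickle.load(io_reader)[None]
    for cookie in scrapy_jar:
        # RequestsCookieJar accepts http.cookiejar.Cookie objects directly
        session.cookies.set_cookie(cookie)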