Example #1
 def create_request(self, url, response=None, **kwargs):
     if response is not None:
         cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
         cookieJar.extract_cookies(response, response.request)
     else:
         cookieJar = CookieJar()
     kwargs.update(meta={'dont_merge_cookies': True,
                         'cookie_jar': cookieJar})
     request = Request(url, **kwargs)
     cookieJar.add_cookie_header(request)
     return request
Example #2
 def create_request(self, url, response=None, **kwargs):
     # This function could be replaced by using CookiesMiddleware instead.
     if response is not None:
         cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
         cookieJar.extract_cookies(response, response.request)
     else:
         cookieJar = CookieJar()
     kwargs.update(meta={
         'dont_merge_cookies': True,
         'cookie_jar': cookieJar
     })
     request = Request(url, **kwargs)
     cookieJar.add_cookie_header(request)
     return request
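The comment in Example #2 points at letting Scrapy's built-in CookiesMiddleware do this work instead. A minimal sketch of that approach, relying on the documented 'cookiejar' Request meta key (the default session key of 0 is an arbitrary placeholder):

 def create_request(self, url, response=None, **kwargs):
     # CookiesMiddleware keeps one jar per 'cookiejar' meta key and merges
     # Set-Cookie headers automatically; no manual CookieJar handling needed.
     session_key = response.meta.get('cookiejar', 0) if response is not None else 0
     kwargs.setdefault('meta', {})['cookiejar'] = session_key
     return Request(url, **kwargs)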
Example #3
 def parse(self, response):
     sel = Selector(response)
     follow_button = '//div[@class="follow-button"]/@props-data-collection-id'
     notebook_button = '//div[@class="follow-button"]/@props-data-notebook-id'
     script_collection = '//script[@data-name="collection"]/text()'
     cid = sel.xpath(follow_button).extract_first() or sel.xpath(
         notebook_button).extract_first() or sel.xpath(
              script_collection).re_first(r'"id":(\d+)')
     if not cid:
         raise ValueError('no collection articles, collection id is None.')
     while not self.done:
         cookie_jar = response.meta.setdefault('cookiejar', CookieJar())
         cookie_jar.extract_cookies(response, response.request)
         if "/c/" in response.url:
             collection_url = BaseHelper.get_collection_articles_url(
                 cid, self.page, self.count)
         elif "/nb/" in response.url:
             collection_url = BaseHelper.get_notebooks_articles_url(
                 cid, self.page, self.count)
         request = Request(collection_url,
                           headers=BaseHelper.get_headers_json(),
                           callback=self.parse_collection)
         cookie_jar.add_cookie_header(request)  # apply Set-Cookie ourselves
         request.meta['cookiejar'] = cookie_jar
         yield request
         self.page += 1
Example #4
 def start_requests(self):
     yield Request(
         url=self.url,
         headers={
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, br',
             'Accept-Language': 'en-US,en;q=0.5',
             'Connection': 'keep-alive',
             'Host': 'item.jd.com',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) '
                           'Gecko/20100101 Firefox/52.0',
         },
         method='GET',
         meta={
             'dont_merge_cookies': True,
             'cookiejar': CookieJar(),
         },
         dont_filter=True,
         callback=self.get_comment_count)
Example #5
def get_cookies_dict_from_response(response):
    jar = CookieJar()
    # extract_cookies() stores the response cookies in the jar;
    # make_cookies() returns them as a list of Cookie objects
    jar.extract_cookies(response, response.request)
    cookie_objs = jar.make_cookies(response, response.request)
    cookies = {_.name: _.value for _ in cookie_objs}

    return cookies
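A hedged usage sketch for the helper above; the callback name and the follow-up URL are placeholders:

def parse(self, response):
    # turn this response's Set-Cookie headers into a plain dict
    cookies = get_cookies_dict_from_response(response)
    # and hand them to an explicit follow-up request
    yield Request('https://example.com/next', cookies=cookies, callback=self.parse_next)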
Example #6
    def parse(self, response):
        """
        Log in to Chouti.
        :param response:
        :return:
        """
        # Create a CookieJar object
        cookie_obj = CookieJar()
        # Extract the cookies from the response
        cookie_obj.extract_cookies(response, response.request)
        # Store the cookies in a member variable
        self.cookie = cookie_obj._cookies

        # Send the login request
        yield Request(
            url="https://dig.chouti.com/login",
            method="POST",
            headers={
                "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            },
            # The request body here must be a string, not a dict
            body="phone=886918207171&password=jamie851230&oneMonth=1",
            cookies=cookie_obj._cookies,
            callback=self.check_login,
        )
Example #7
    def check_login(self, response):
        # Check the server response to see whether login succeeded
        text_json = json.loads(response.text)
        # '登录成功' is the literal "login successful" message the server returns
        if not ('msg' in text_json and text_json['msg'] == '登录成功'):
            print('login failed')
            cookie_jar = CookieJar()
            # yield is used here rather than return [...] because this function
            # contains yield and is therefore a generator; a generator is ended
            # with a bare return (raise StopIteration would be a RuntimeError
            # under PEP 479)
            yield scrapy.Request('https://www.zhihu.com/#signin',
                                 callback=self.get_captcha,
                                 meta={'cookie': cookie_jar},
                                 dont_filter=True)
            return

        print('login succeeded')
        # Login succeeded: extract the cookies and save them locally
        cookies = response.meta['cookie']
        cookies.extract_cookies(response, response.request)
        with open('cookies.txt', 'w') as f:
            for cookie in cookies:
                f.write(str(cookie) + '\n')

        for url in self.start_urls:
            # dont_filter tells the scheduler not to dedupe this request,
            # so the same request can be issued more than once
            yield scrapy.Request(url, dont_filter=True)
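For reference, PEP 479 (Python 3.7+) converts a `raise StopIteration` inside a generator into a RuntimeError, which is why the function above ends with a bare `return`. A minimal illustration:

def gen_ok():
    yield 1
    return  # the correct way to stop a generator early

def gen_broken():
    yield 1
    raise StopIteration  # RuntimeError under PEP 479 (Python 3.7+)

assert list(gen_ok()) == [1]
# list(gen_broken())  -> RuntimeError: generator raised StopIteration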
Example #8
    def parse(self, response):
        # response.text is the full front-page content
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Walk the nested jar and flatten the cookies into a dict
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        post_dict = {
            'phone': '8615915455813',
            'password': '******',
            'oneMonth': 1,
        }

        # Send a POST request to log in
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            cookies=self.cookie_dict,
            headers={
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8'
            },
            body=urllib.parse.urlencode(post_dict),
            callback=self.parse2  # run the callback on success
        )
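The triple-nested loop over the private `_cookies` attribute recurs throughout these examples. Since Scrapy's CookieJar is iterable over its cookies, a flatter equivalent is possible; a sketch, not the original authors' code:

    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    # iterate the jar directly instead of walking _cookies[domain][path][name]
    cookie_dict = {cookie.name: cookie.value for cookie in cookie_jar}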
Example #9
File: chouti.py Project: Mitsui1993/scrapy
    def parse1(self, response):
        # response.text is the full front-page content
        from scrapy.http.cookies import CookieJar
        cookie_jar = CookieJar()  # object that encapsulates the cookies
        cookie_jar.extract_cookies(response, response.request)  # pull the cookies from the response

        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        post_dict = {
            'phone': '8615131255089',
            'password': '******',
            'oneMonth': 1,
        }
        import urllib.parse

        # Goal: send a POST request to log in
        yield Request(url="http://dig.chouti.com/login",
                      method='POST',
                      cookies=self.cookie_dict,
                      body=urllib.parse.urlencode(post_dict),
                      headers={
                          'Content-Type':
                          'application/x-www-form-urlencoded; charset=UTF-8'
                      },
                      callback=self.parse2)
Example #10
class LevelSubjects(scrapy.Spider):
    name = "level_subjects"
    # Instantiate a CookieJar object
    cookie_jar = CookieJar()

    def start_requests(self):  # crawl the pages below starting from this method
        cookie = get_valid_cookie()

        if cookie:
            yield scrapy.Request(url=LEVEL_SUBJECTS,
                                 callback=self.parse,
                                 cookies=cookie)
        else:
            pass  # TODO: handle the missing-cookie case

    def parse(self, response):
        sub_obj = response.css('.tip-pop').xpath('./dl')
        for item in sub_obj:
            level_name = item.xpath('./dt/text()').get()
            for sub_a in item.xpath('.//a'):
                ls_item = LevelSubjectsItem()
                # Teaching level name, e.g. elementary school
                ls_item['level_name'] = level_name
                # Subject name
                ls_item['subject_name'] = sub_a.xpath('./text()').get()
                # Subject URL
                ls_item['search_url'] = sub_a.xpath('./@href').get()
                # Subject code
                ls_item['subject_code'] = ls_item['search_url'][1:].split(
                    '/')[0]
                # Teaching level code
                ls_item['level_code'] = ls_item['search_url'][1:].split('/')[0]
                yield ls_item
Example #11
    def login(self, response):
        from scrapy.http.cookies import CookieJar
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        post_dict = {
            "phone": "8618001999999",
            "password": "******",
            "oneMonth": 1,
        }

        import urllib.parse

        yield Request(
            url="http://dig.chouti.com/login",
            method='POST',
            cookies=self.cookie_dict,
            body=urllib.parse.urlencode(post_dict),
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            callback=self.parse1
        )
Example #12
    def parse1(self, response):
        """Handle the front page and log in."""
        # response.text is the full front-page content
        from scrapy.http.cookies import CookieJar
        cookie_jar = CookieJar()
        # extract_cookies() returns None -- it fills the jar in place --
        # so keep the jar itself rather than the return value
        cookie_jar.extract_cookies(response, response.request)
        self.cookie_jar = cookie_jar

        post_dict = {
            'phone': '8617748232617',
            'password': '******',
            'oneMonth': 1,
        }

        import urllib.parse
        # urlencode() produces the phone=86123&password=123&oneMonth=1 format
        data = urllib.parse.urlencode(post_dict)
        # Send a POST request to log in; the cookies argument wants a dict,
        # so flatten the jar
        yield Request(url='http://dig.chouti.com/login',
                      method='POST',
                      cookies={c.name: c.value for c in self.cookie_jar},
                      body=data,
                      headers={
                          'Content-Type':
                          'application/x-www-form-urlencoded; charset=UTF-8'
                      },
                      callback=self.parse2)
Example #13
    def login(self, response):
        '''Send an AJAX request to log in'''

        # Pull the cookie info out of the response
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Flatten the jar into a plain dict for the cookies argument
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        login_req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8'
            },
            # form-encoded body: key=value pairs joined with &
            body='phone=8618922795525&password=woaiwojia89&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login,
        )
        print(self.cookie_dict)
        print('login request issued')
        yield login_req
Example #14
    def start_requests(self):
        with open(self.cookie_file) as f:
            cookiejar = f.read()
        p = re.compile(r'<Cookie (.*?) for .zhihu.com/>')
        cookies = re.findall(p, cookiejar)
        mcookies = []
        for cookie in cookies:
            parts = cookie.split('=')
            mcookies.append((parts[0], '='.join(parts[1:])))
        COOKIE = dict(mcookies)
        cookie_jar = CookieJar()
        if COOKIE:
            print('login with cookie!')
            return [Request('https://www.zhihu.com',
                            meta={'cookiejar': cookie_jar},
                            headers=HEADER,
                            cookies=COOKIE,
                            callback=self.after_login)]
        else:
            print('login with password!')
            return [Request("https://www.zhihu.com/#signin",
                            meta={'dont_merge_cookies': True, 'cookiejar': cookie_jar},
                            headers=HEADER,
                            callback=self.post_login)]
Example #15
    def parse(self, response):
        """
        Handle the response from the first visit to Chouti.
        :param response:
        :return:
        """

        # Pull the cookies out of the response headers into a CookieJar
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Unpack the jar's cookies into a dict
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            # equivalently: body=urlencode({'phone': '8613121758648', ...})
            body="phone=8613121758648&password=woshiniba&oneMonth=1",
            cookies=self.cookie_dict,
            headers={
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8'
            },
            callback=self.check_login)
Example #16
    def start_crawl(self, response):
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)

        for url in self.start_urls:
            ZIP_CODE = 33131
            TIME = str(int(time.time() * 1000))
            title, product_id, sku_id, status, price = self.get_detail(
                url.rstrip())

            if all(v is None for v in [title, product_id, sku_id, status, price]):
                continue

            get_shipping_url = f"https://www.samsclub.com/sams/shop/product/moneybox/shippingDeliveryInfo.jsp?zipCode={ZIP_CODE}&productId={product_id}&skuId={sku_id}&status={status}&isSelectedZip=true&isLoggedIn=true&_={TIME}"

            item = {
                "title": title,
                "status": status,
                "price": price,
                "product_id": product_id,
                "sku_id": sku_id
            }

            request = scrapy.Request(
                get_shipping_url, meta={"item": item, "cookie_jar": cookieJar}, callback=self.parse)
            cookieJar.add_cookie_header(request)
            yield request
Example #17
    def parse(self, response):

        cookie_obj = CookieJar()
        cookie_obj.extract_cookies(response, response.request)
        # print('cookie get ==================')
        # print(cookie_obj)
        # print(dir(cookie_obj))
        # print(cookie_obj._cookies)  # inspect the cookies
        # print('cookie end ===================')
        # print(response)
        page = response.meta['page']
        next_page = page + 1
        logging.info('on parse')
        logging.info(f'next page ========== {next_page}')
        articles = response.xpath('//article[@class="excerpt"]')
        for article in articles:
            item = AsyncSandboxItem()
            category = article.xpath('./header/a[1]/text()').extract_first()
            title = article.xpath('./header/h2/a[1]/text()').extract_first()
            article_url = article.xpath(
                './header/h2/a[1]/@href').extract_first()
            item['title'] = title
            item['category'] = category
            item['article_url'] = article_url

            yield Request(url=article_url,
                          callback=self.parse_item,
                          meta={'item': item})

        if next_page < 900:
            yield Request(
                url=self.BASE_URL.format(next_page),
                meta={'page': next_page},
                # dont_filter=True
            )
Example #18
    def parse(self, response):
        login_url = 'https://passport.36kr.com/passport/sign_in'
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for m, n in cookie_jar._cookies.items():
            for m1, n1 in n.items():
                for m2, n2 in n1.items():
                    if m2 in [
                            'aliyungf_tc', 'krnewsfrontss', 'device-uid',
                            'M-XSRF-TOKEN'
                    ]:
                        self.cookie_jar[m2] = n2.value
        """
        type:login
        bind:false
        needCaptcha:false
        username:18616561846
        password:abcd.1234
        ok_url:https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE
        ktm_reghost:null
        """

        yield scrapy.Request(
            url=login_url,
            method='POST',
            # adjacent string literals concatenate cleanly; a backslash
            # continuation would embed the next line's leading spaces in the body
            body='type=login&bind=false&needCaptcha=false&username=18616561846&password=abcd.1234'
                 '&ok_url=https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE&ktm_reghost=null',
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            cookies=self.cookie_jar,
            callback=self.login)
Example #19
File: mail.py Project: Ricsk/for_scrapy
class MailSpider(scrapy.Spider):
    name = 'mail'
    #allowed_domains = ['mail.163.com']
    #start_urls = ['https://mail.163.com/']
    cookie_jar = CookieJar()

    def start_requests(self):
        urls = [
            'https://dl.reg.163.com/webzj/m163_2/pub/index_dl.html?wdaId=&pkid=CvViHzl&product=mail163',
            'https://dl.reg.163.com/getConf?callback=URSJSONP1524651825971&pkid=CvViHzl&pd=mail163&mode=1',
            'https://dl.reg.163.com/ini?pd=mail163&pkid=CvViHzl&pkht=mail.163.com&topURL=https%3A%2F%2Fmail.163.com%2F&nocache=1524651826394'
        ]
        for url in urls:
            yield scrapy.Request(url=url,
                                 meta={'cookiejar': 1},
                                 callback=self.parse)

    def parse(self, response):
        if response.url == 'https://dl.reg.163.com/ini?pd=mail163&pkid=CvViHzl&pkht=mail.163.com&topURL=https%3A%2F%2Fmail.163.com%2F&nocache=1524651826394':
            yield scrapy.Request(url='https://mail.163.com/',
                                 meta={'cookiejar': 1},
                                 callback=self.parse2)

    def parse2(self, response):
        pass
Example #20
 def start_requests(self):
     cookie_jar = CookieJar()
     yield FormRequest(self.url0,
                       formdata=self.data0,
                       headers=self.header,
                       meta={'cookiejar': cookie_jar},
                       callback=self.parse)
Example #21
class BaseRegistroCivilSpider(scrapy.Spider):
    cookie_jar = CookieJar()
    login_url = "https://transparencia.registrocivil.org.br/registral-covid"
    start_urls = []
    xsrf_token = ""

    def start_requests(self):
        yield self.make_login_request()

    def make_login_request(self):
        return scrapy.Request(
            url=self.login_url,
            callback=self.parse_login_response,
            meta={"dont_cache": True},
        )

    def make_request(self, *args, **kwargs):
        kwargs["headers"] = kwargs.get("headers", {})
        kwargs["headers"]["X-XSRF-TOKEN"] = self.xsrf_token
        return scrapy.Request(*args, **kwargs)

    def start_requests_after_login(self):
        for url in self.start_urls:
            yield self.make_request(url, callback=self.parse)

    def parse_login_response(self, response):
        self.cookie_jar.extract_cookies(response, response.request)
        self.xsrf_token = next(c for c in self.cookie_jar
                               if c.name == "XSRF-TOKEN").value

        for request in self.start_requests_after_login():
            yield request

    def parse(self, response):
        raise NotImplementedError()
Example #22
File: chouti.py Project: AIF333/sp
    def login(self, response):
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Grab the pre-login (unauthenticated) cookies
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        data = {
            "phone": "8613476152416",
            "password": "******",
            "oneMonth": 1,
        }
        # urlencode() turns the dict into phone=8613476152416&password=...&oneMonth=1
        req = Request(url='https://dig.chouti.com/login',
                      method='POST',
                      headers={
                          'Content-Type':
                          'application/x-www-form-urlencoded; charset=UTF-8'
                      },
                      body=urlencode(data),
                      cookies=self.cookie_dict,
                      callback=self.check_login)
        yield req
Example #23
 def __init__(self, start_url=None, history=True):
     super(Sdjnggzyjy, self).__init__()
     # jobs = start_url.split('|')
     jobs = self.start_urls
     self.cookie_jar = CookieJar()
     self.count = 0
     for job in jobs:
         self.headers = {
             'Accept': 'application/json, text/javascript, */*; q=0.01',
             # 'Accept-Encoding':'gzip,deflate',
             'Accept-Language':
             'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
             'Connection': 'keep-alive',
             'Content-Length': '90',
             'Content-Type': 'application/json',
             # 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/64.0',
             'Host': 'www.jnggzyjy.gov.cn',
             'Referer': job.split()[0],
             'Public-X-XSRF-TOKEN': '',
             # 'X-Requested-With':'XMLHttpRequest',
         }
         self.post_params.append({
             "url": job.split()[0],
             "ba_type": job.split()[1]
         })
     # dispatcher.connect(self.initial, signals.engine_started)
     self.history = history
Example #24
    def make_cookie_reqs(self, url, payload, xss_param):
        ''' Generate payloaded cookie header requests '''

        # string.ascii_lowercase on Python 3 (string.lowercase was Python 2 only)
        two_rand_letters = random.choice(string.ascii_lowercase) + random.choice(
            string.ascii_lowercase)
        delim_str = self.delim + two_rand_letters
        payload = delim_str + payload + delim_str + ';9'

        reqs = [
            Request(url,
                    meta={
                        'xss_place': 'header',
                        'cookiejar': CookieJar(),
                        'xss_param': xss_param,
                        'orig_url': url,
                        'payload': payload,
                        'delim': delim_str
                    },
                    cookies={'userinput': payload},
                    callback=self.xss_chars_finder,
                    dont_filter=True)
        ]

        # reqs always holds exactly one request here
        return reqs
Example #25
    def scrape_library(self, response):
        profile_url = response.meta['profile_url']
        hxs = HtmlXPathSelector(response)

        for item in hxs.select(
                '//div[@id="content"]/table[@id="lt_catalog_list"]/tbody/tr'):
            book = LibraryThingLibraryItem()
            book['profile_url'] = profile_url

            cells = item.select('td')
            book['work_url'] = ('http://www.librarything.com%s'
                                % cells[1].select('.//a/@href').extract()[0])
            book['work_id'] = book['work_url'].split('/')[-3]
            book['rating'] = cells[5].select('.//input/@value').extract()[0]
            book['date_added'] = str(dateutil.parser.parse(
                cells[6].select('.//text()').extract()[0]).date())

            yield book

        next_page = hxs.select(
            '//div[@id="content"]/table//nobr/a[contains(text(), "next page")]'
        )
        if next_page:
            cookie_jar = response.meta.setdefault('cookie_jar', CookieJar())
            cookie_jar.extract_cookies(response, response.request)

            request = Request(
                'http://www.librarything.com%s' % next_page.select('@href').extract()[0],
                meta={'profile_url': profile_url, 'dont_merge_cookies': True,
                      'cookie_jar': cookie_jar},
                callback=self.scrape_library)
            cookie_jar.add_cookie_header(request)
            yield request
Example #26
 def login_callback(self, response):
     cookiejar = CookieJar()
     cookiejar.extract_cookies(response, response.request)
     self.cookiejar = cookiejar

     request = scrapy.http.Request(
         url='http://www.baiinfo.com/Search/Index?wd=%E6%95%A3%E8%A3%85%E8%BF%9B%E5%8F%A3%E6%B2%A5%E9%9D%92%E5%88%B0%E5%B2%B8%E4%BB%B7',
         callback=self.parse_index)
     return [request]
Example #27
 def get_cookies(self, res):
     '''Save the cookies from a successful login'''
     cookie_jar = CookieJar()
     cookie_jar.extract_cookies(res, res.request)
     # the CookieJar is dict-like; write each cookie out to a file
     with open('cookies.txt', 'w') as f:
         for cookie in cookie_jar:
             f.write(str(cookie) + '\n')
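The `str(cookie)` lines written above are hard to load back programmatically. A hedged alternative sketch that persists name/value pairs as JSON so they can later be fed to `Request(cookies=...)` (the file name is a placeholder):

 def save_cookies_json(self, res):
     import json
     cookie_jar = CookieJar()
     cookie_jar.extract_cookies(res, res.request)
     # a JSON dict round-trips cleanly, unlike repr() strings
     with open('cookies.json', 'w') as f:
         json.dump({c.name: c.value for c in cookie_jar}, f)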
Example #28
 def start_requests(self):
     for url in self.start_urls:
         # it seems the site cannot be accessed without a cookie
         cookie_jar = CookieJar()
         yield Request(url,
                       meta={'cookiejar': cookie_jar},
                       headers=self.head,
                       callback=self.login_in)
Example #29
 def test_setitem(self):
     cookies = CookieJar()
     self.storage['new_cookies'] = cookies
     self.assertDictEqual(
         self.storage.coll.find_one({'key': 'new_cookies'}, {'_id': 0}), {
             'key': 'new_cookies',
             'cookiejar': pickle.dumps(cookies),
             'cookies': cookies._cookies
         })
Example #30
def extract_to_file(session=context.http_session):
    scrapy_jar = CookieJar(policy=session.cookies.get_policy())
    for cookie in session.cookies:
        scrapy_jar.set_cookie(cookie)

    with open(cookie_name(), 'wb') as io_writer:
        # save it in a way scrapy_cookies can gather later on
        pickle.dump({None: scrapy_jar}, io_writer)
    os.chmod(cookie_name(), mode=0o600)
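A sketch of the reverse direction under the same assumptions: the `{None: jar}` mapping above is what was pickled, and `cookie_name()` is the example's own helper:

def load_from_file():
    # restore the jar dict written by extract_to_file()
    with open(cookie_name(), 'rb') as io_reader:
        jars = pickle.load(io_reader)
    return jars[None]  # the scrapy CookieJar stored under key None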