Пример #1
0
    def shibiecode(self,response):
        """Act on the JSON answer that carries the recognised validate code.

        Only HTTP 200 responses are processed. When the decoded JSON body has
        a truthy ``code``, it is POSTed to ``CheckVisitCode`` (callback
        ``checkcode``). Otherwise a new ``ValidateCode`` request is queued
        (callback ``certifycode``).
        """
        if response.status != 200:
            return

        recognised = json.loads(response.content).get('code')
        if recognised:
            follow_up = Request(
                url='http://wenshu.court.gov.cn/Content/CheckVisitCode',
                proxy=response.proxy,
                method='POST',
                meta=response.request.meta,
                cookies=response.request.cookies,
                data={'ValidateCode': recognised},
                callback='checkcode',
                priority=9,
                allow_redirects=False,
                allow_proxy=False)
        else:
            # No code in the answer: ask for a new validate code instead.
            follow_up = Request(
                url='http://wenshu.court.gov.cn/User/ValidateCode',
                meta=response.request.meta,
                proxy=response.proxy,
                method='GET',
                callback='certifycode',
                priority=7,
                allow_redirects=False,
                allow_proxy=False)
        self.push(follow_up)
Пример #2
0
 def start_request(self):
     """Queue one ``GetCode`` request for every pending row in ``a_copy``.

     Reads up to 10 ``(id, canshu)`` rows with ``id > 2000000`` and
     ``count = -1`` from MySQL, then pushes a POST request per row whose
     meta carries the row id, the ``canshu`` value (as ``code``) and a
     fresh guid from ``self.get_guid()``.

     The connection and cursor are closed once the rows are fetched; the
     original left both open. The query is read-only, so no commit is
     issued.
     """
     # NOTE(review): credentials are hard-coded here; consider moving them
     # to settings.
     connect = pymysql.Connect(
         host='localhost',
         port=3306,
         user='******',
         passwd='Elements123',
         db=self.custom_settings['MYSQL_DBNAME'],
         charset='utf8',
         use_unicode=True
     )
     try:
         print('开始查询')
         cursor = connect.cursor()
         try:
             cursor.execute(
                 "select id,canshu from a_copy where id >2000000 and count=-1 limit 10")
             result = cursor.fetchall()
         finally:
             cursor.close()
     finally:
         connect.close()

     url = 'http://wenshu.court.gov.cn/ValiCode/GetCode'
     for row in result:
         guid = self.get_guid()
         data = {'guid': guid}
         # A new dict per request so requests never share a headers object.
         headers = {
             'Accept': '*/*',
             'Accept-Encoding': 'gzip, deflate',
             'Accept-Language': 'zh-CN,zh;q=0.9',
             'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
             'X-Requested-With': 'XMLHttpRequest'
         }
         request = Request(url=url, method='POST', headers=headers, data=data,
                           meta={'code': row[1], 'id': row[0], 'guid': guid},
                           callback='parse', priority=1, allow_proxy=True,
                           allow_redirects=False)
         self.start_push(request)
Пример #3
0
 def start_request(self):
     """Queue a listing request for every city / work-type combination.

     ``self.city_dict`` maps a key to a ``{city: code}`` dict; the part of
     ``code`` before the first ``|`` is used as the 58.com subdomain. Each
     request is followed by a 0.1 s pause.
     """
     for cities in self.city_dict.values():
         for city, code in cities.items():
             for wtype in self.work_type:
                 url = 'https://{}.58.com{}'.format(code.split('|')[0], wtype)
                 browser_headers = {
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                     'Accept-Encoding': 'gzip, deflate, sdch, br',
                     'Accept-Language': 'zh-CN,zh;q=0.8',
                     'Upgrade-Insecure-Requests': '1',
                     'User-Agent': user_agent.random
                 }
                 request_meta = {'city': city, 'page': 1, 'url': url}
                 self.start_push(
                     Request(url=url, method='GET', headers=browser_headers,
                             meta=request_meta, callback='parse',
                             allow_redirects=False, allow_proxy=False))
                 time.sleep(0.1)
Пример #4
0
 def start_request(self):
     """Queue a GET request for every ``zp_58_urls`` row with ``label=9``.

     Each row is ``(id, url)``; the id is forwarded in the request meta.

     The connection and cursor are closed once the rows are fetched; the
     original left both open. The query is read-only, so no commit is
     issued.
     """
     # NOTE(review): credentials are hard-coded here; consider moving them
     # to settings.
     connect = pymysql.Connect(host='localhost',
                               port=3306,
                               user='******',
                               passwd='Elements123',
                               db=self.custom_settings['MYSQL_DBNAME'],
                               charset='utf8',
                               use_unicode=True)
     try:
         print('开始查询')
         cursor = connect.cursor()
         try:
             cursor.execute("select id,url from zp_58_urls WHERE label=9")
             result = cursor.fetchall()
         finally:
             cursor.close()
     finally:
         connect.close()

     for row_id, row_url in result:
         headers = {
             'accept':
             'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'accept-encoding': 'gzip, deflate, sdch, br',
             'accept-language': 'zh-CN,zh;q=0.8',
             'upgrade-insecure-requests': '1',
             'User-Agent': user_agent.random
         }
         request = Request(url=row_url,
                           method='GET',
                           headers=headers,
                           callback='parse',
                           priority=1,
                           meta={'id': row_id},
                           allow_redirects=False)
         self.start_push(request)
Пример #5
0
    def im_parse(self, response):
        """Decode the verification image and submit the answer.

        On HTTP 200 the image bytes are passed to ``self.model.get_code``,
        which returns a mapping. The values for the keys listed in
        ``meta['check']`` are joined with ``|`` and sent as ``data`` to
        ``meta['check_url']`` (callback ``check_parse``).

        Fix: the original wrote ``'..._%s' % time.time() * 1000``. ``%`` and
        ``*`` share precedence and associate left to right, so the formatted
        string was repeated 1000 times instead of carrying a millisecond
        timestamp. Both ``callback`` and ``_`` now use an integer
        millisecond timestamp.
        """
        if response.status != 200:
            return

        code = self.model.get_code(response.content)
        code1 = '|'.join(
            [code.get(i) for i in response.request.meta['check']])
        print(code, code1)

        # Millisecond epoch timestamp, computed before formatting.
        millis = int(time.time() * 1000)
        params = {
            'data': code1,
            'callback': 'jQuery111309236942442398923_%s' % millis,
            '_': '%s' % millis
        }
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent.random
        }
        request = Request(url=response.request.meta['check_url'],
                          method='GET',
                          params=params,
                          headers=headers,
                          callback='check_parse',
                          meta=response.request.meta,
                          allow_redirects=True,
                          allow_proxy=False,
                          priority=5)
        self.push(request)
        return
Пример #6
0
 def js_parse(self, response):
     """Extract the click sequence from the verification script.

     On HTTP 200 the text inside the ``<i>`` tag after the prompt is split
     on ``-`` and stored, stripped, in ``meta['check']``. The image at
     ``meta['img_url']`` is then requested (callback ``im_parse``).
     Nothing is queued when the prompt is not found.
     """
     if response.status != 200:
         return

     found = re.findall('请在下方的键盘中依次点击 <i>(.*)</i>', response.text())
     if not found:
         return

     sequence = [piece.strip() for piece in found[0].split('-')]
     browser_headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': user_agent.random
     }
     response.request.meta['check'] = sequence
     self.push(Request(url=response.request.meta['img_url'],
                       method='GET',
                       headers=browser_headers,
                       callback='im_parse',
                       meta=response.request.meta,
                       allow_redirects=True,
                       allow_proxy=False,
                       priority=4))
     return
Пример #7
0
    def next_parse(self, response):
        """Queue page 1 of every category link found on the page.

        On HTTP 200 each ``<a>`` under ``dl.selIndCate`` yields a request
        for ``https:<href>pn1`` (callback ``info_parse``). The meta keeps
        the city, page number 1 and the un-paged ``https:<href>`` as
        ``model_url``. All queued requests share one headers dict.
        """
        if response.status != 200:
            return

        shared_headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, sdch, br',
            'accept-language': 'zh-CN,zh;q=0.8',
            'upgrade-insecure-requests': '1',
            'User-Agent': user_agent.random
        }
        city = response.request.meta['city']
        anchors = Selector(text=response.content).xpath(
            "//dl[@class='selIndCate']/dd/span/a")
        for anchor in anchors:
            model_url = 'https:' + anchor.xpath("./@href").extract_first('')
            page_meta = {'city': city, 'page': 1, 'model_url': model_url}
            self.push(Request(url=model_url + 'pn1',
                              method='GET',
                              headers=shared_headers,
                              callback='info_parse',
                              priority=2,
                              allow_redirects=False,
                              meta=page_meta,
                              allow_proxy=False))
Пример #8
0
    def start_request(self):
        """Seed requests for the Baidu job listing, one per city (currently disabled).

        The bare ``return`` below exits immediately, so everything after it
        is unreachable and no request is queued. The dead code would read
        city names from ``wander.baidu_zhaopin_city`` and push one GET
        request per city with the city name in the request meta.
        """
        #time.sleep(100)
        # Early exit: the seeding logic below never runs.
        return
        # NOTE(review): unreachable from here on. If re-enabled, the
        # connection and cursor are never closed and the credentials are
        # hard-coded.
        connect = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='******',
            passwd='Elements123',
            db=self.custom_settings['MYSQL_DBNAME'],
            charset='utf8',
            use_unicode=True
        )
        print('开始查询')
        cursor = connect.cursor()
        cursor.execute(
            "select city from wander.baidu_zhaopin_city")

        connect.commit()
        result = cursor.fetchall()
        print('查询成功')
        for i in result:
            # i is a one-column row; i[0] is the city value put into the URL.
            url='http://zhaopin.baidu.com/quanzhi?city=%s'%i[0]
            headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                       'Accept-Encoding': 'gzip, deflate, sdch',
                       'Accept-Language': 'zh-CN,zh;q=0.8',
                       'Upgrade-Insecure-Requests': '1',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
            request = Request(url=url, method='GET', headers=headers, callback='parse', priority=1,
                              allow_redirects=False, meta={'city': i[0]}, allow_proxy=True)
            self.start_push(request)
Пример #9
0
 def check_parse(self, response):
     """Handle the JSONP verification answer and retry the original page.

     On HTTP 200 the text between the last ``(`` and the following ``)`` is
     decoded as JSON. When it carries a truthy ``code``, a request to
     ``meta['check_url']`` is queued (callback ``parse``) with the
     submitted answer, the returned code as signature, a timestamp and the
     original page URL as ``redirect``.

     Fix: the original wrote ``'%s' % time.time() * 1000``. ``%`` and ``*``
     share precedence and associate left to right, so the formatted string
     was repeated 1000 times. ``timestamp`` is now an integer millisecond
     value.
     """
     if response.status != 200:
         return

     result = json.loads(response.text().split('(')[-1].split(')')[0])
     code = result.get('code')
     if not code:
         return

     headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': user_agent.random
     }
     params = {
         'ez_verify_code': response.request.params['data'],
         'ez_verify_sign': code,
         # Millisecond epoch timestamp, computed before formatting.
         'timestamp': '%s' % int(time.time() * 1000),
         'identity': 'spider',
         'redirect': response.request.meta['url'],
         'scene': 'spider'
     }
     request = Request(url=response.request.meta['check_url'],
                       method='GET',
                       params=params,
                       headers=headers,
                       callback='parse',
                       meta=response.request.meta,
                       allow_redirects=True,
                       allow_proxy=False,
                       priority=5)
     self.push(request)
     return
Пример #10
0
 def cookie_parse(self,response):
     """Forward the ``vjkl5`` cookie to the local ``get_vl5x`` service.

     On HTTP 200 the cookie value is stored in the request meta under
     ``vjkl5`` and a GET to ``http://127.0.0.1:8080/get_vl5x`` is queued
     (callback ``vjkl5_parse``) with the value as ``cookie`` parameter.
     """
     if response.status != 200:
         return

     cookie_value = response.cookies.get('vjkl5').value
     response.request.meta['vjkl5'] = cookie_value
     self.push(Request(url='http://127.0.0.1:8080/get_vl5x?cookie=%s' % cookie_value,
                       method='GET',
                       meta=response.request.meta,
                       callback='vjkl5_parse',
                       priority=3,
                       allow_redirects=False,
                       allow_proxy=False))
Пример #11
0
 def start_request(self):
     """Queue a GET request for every ``(city, url)`` row of ``bxw_city_url``.

     The city is forwarded in the request meta.

     The connection and cursor are closed once the rows are fetched; the
     original left both open. The query is read-only, so no commit is
     issued.
     """
     # NOTE(review): credentials are hard-coded here; consider moving them
     # to settings.
     connect = pymysql.Connect(host='localhost',
                               port=3306,
                               user='******',
                               passwd='Elements123',
                               db=self.custom_settings['MYSQL_DBNAME'],
                               charset='utf8',
                               use_unicode=True)
     try:
         print('开始查询')
         cursor = connect.cursor()
         try:
             cursor.execute("select city,url from bxw_city_url")
             result = cursor.fetchall()
         finally:
             cursor.close()
     finally:
         connect.close()

     for city, url in result:
         headers = {
             'Accept':
             'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'Accept-Language': 'zh-CN,zh;q=0.8',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': user_agent.random
         }
         request = Request(url=url,
                           method='GET',
                           headers=headers,
                           callback='parse',
                           meta={'city': city},
                           allow_redirects=True,
                           allow_proxy=False,
                           priority=1)
         self.start_push(request)
Пример #12
0
 def start_request(self):
     """Queue the company-search POST for pages 1 through 50.

     Fix: the original looped over ``range(1, 51)`` but always sent
     ``'p': 1``, so 50 identical requests were queued. The loop index is
     now sent as ``p``.
     NOTE(review): assumes ``p`` is the page-number field of the search
     endpoint -- confirm against the site.
     """
     url = 'http://www.qzrc.com/Search.ashx?action=c&rnd=0.16554745083462485'
     for page in range(1, 51):
         data = {
             'stype': 'k',
             'p': page,
             'k': '公司',
             'pn': '150',
             'urlfrom': 'http://www.qzrc.com/companyList.shtml',
             'ps': '25'
         }
         headers = {
             'Accept':
             'application/json, text/javascript, */*; q=0.01',
             'Accept-Encoding':
             'gzip, deflate',
             'Accept-Language':
             'zh-CN,zh;q=0.8',
             'Content-Type':
             'application/x-www-form-urlencoded; charset=UTF-8',
             'X-Requested-With':
             'XMLHttpRequest',
             'User-Agent':
             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
         }
         request = Request(url=url,
                           method='POST',
                           headers=headers,
                           data=data,
                           callback='parse',
                           allow_proxy=False,
                           allow_redirects=False)
         self.start_push(request)
Пример #13
0
    def start_request(self):
        """Queue the single search request that starts the hxrc.com crawl."""
        browser_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        }
        self.start_push(Request(
            url='http://www.hxrc.com/rcnew/SeniorSearchJobInFront.aspx?SearchKind=1&KeyWord=&area=',
            method='GET',
            headers=browser_headers,
            callback='parse',
            allow_redirects=False,
            allow_proxy=False,
            timeout=40))
Пример #14
0
 def checkcode(self,response):
     """React to the answer of the validate-code check.

     ``'1'``: report success, re-queue the request stored in
     ``meta['request_pre']`` and drop this proxy's entry from
     ``self.proxy_dict`` (a failing removal is only printed).
     ``'2'``: queue a new ``ValidateCode`` request (callback
     ``certifycode``). Any other body is ignored.
     """
     verdict = response.text()
     if verdict == '1':
         print(str(response.proxy)+'验证成功')
         self.push(response.request.meta.get('request_pre'))
         proxy_key = response.proxy or '127.0.0.1'
         try:
             self.proxy_dict.pop(proxy_key)
         except Exception as e:
             print(e, proxy_key)
         return

     if verdict == '2':
         self.push(Request(url='http://wenshu.court.gov.cn/User/ValidateCode',
                           meta=response.request.meta,
                           proxy=response.proxy,
                           method='GET',
                           callback='certifycode',
                           priority=7,
                           allow_redirects=False,
                           allow_proxy=False))
Пример #15
0
 def start_request(self):
     """Queue one page request per id, counting down from 11919613 to 2.

     The id is forwarded in the request meta; each push is followed by a
     0.02 s pause.
     """
     page_id = 11919613
     while page_id > 1:
         browser_headers = {
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'Accept-Language': 'zh-CN,zh;q=0.8',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': user_agent.random,
         }
         self.start_push(Request(
             url='http://www.cjob.gov.cn/cjobs/htmls/cb21dwPages/{}.html'.format(page_id),
             method='GET',
             headers=browser_headers,
             callback='parse',
             meta={'id': page_id},
             allow_redirects=False,
             allow_proxy=False))
         time.sleep(0.02)
         page_id -= 1
Пример #16
0
 def start_request(self):
     """Queue one company-info request per id, counting down from 71110 to 2.

     Each push is followed by a 0.1 s pause.
     """
     url_template = 'http://www.scrc168.com/PersonalJobs/CompanyInfo.aspx?companyid={}'
     for company_id in range(71110, 1, -1):
         browser_headers = {
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'Accept-Language': 'zh-CN,zh;q=0.8',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': user_agent.random,
         }
         self.start_push(Request(url=url_template.format(company_id),
                                 method='GET',
                                 headers=browser_headers,
                                 callback='parse',
                                 allow_redirects=False,
                                 allow_proxy=False))
         time.sleep(0.1)
Пример #17
0
 def start_request(self):
     """Queue one company page request per id in [50016110, 51397988).

     Every request gets its own copy of the fixed header set (which
     includes a recorded session cookie); each push is followed by a
     0.02 s pause.
     """
     header_template = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Cookie': 'baicai_p=0ghp5s5hruu3lv6ndhfub367g4828; baicai_u=0gejb5b66ih6r4s8iujeieikj3272; baicai_uID=p_15643299; baicai_area=shanghai; PHPSESSID=q2f4brenaqmotmenl6k5q6npo7; bdshare_firstime=1540355107811; baicai_s=hcnr00t3r4ma2f4l67067njkb7; __utmt=1; BC_VisitCookie=61; BC_VisitNum=61; Hm_lvt_2cb4ec3f3a8343adb1703d1115ec562b=1540353836; Hm_lpvt_2cb4ec3f3a8343adb1703d1115ec562b=1540380599; __utma=104663071.661557839.1540353837.1540353837.1540380084.2; __utmb=104663071.25.10.1540380084; __utmc=104663071; __utmz=104663071.1540353837.1.1.utmcsr=hao123.com|utmccn=(referral)|utmcmd=referral|utmcct=/zhaopin/wangzhi',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
     }
     for company_id in range(50016110, 51397988):
         self.start_push(Request(
             url='http://shanghai.baicai.com/company/{}/'.format(company_id),
             method='GET',
             headers=dict(header_template),
             callback='parse',
             allow_redirects=True,
             allow_proxy=False,
             timeout=60))
         time.sleep(0.02)
Пример #18
0
    def start_request(self):
        """Queue one company page request per id in [417972, 700000).

        The id is forwarded in the request meta; each push is followed by
        a 0.03 s pause.
        """
        for company_id in range(417972, 700000):
            browser_headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': user_agent.random,
            }
            self.start_push(Request(
                url='http://www.cjol.com/jobs/company-{}'.format(company_id),
                method='GET',
                headers=browser_headers,
                callback='parse',
                meta={'id': company_id},
                allow_proxy=False,
                allow_redirects=False))
            time.sleep(0.03)
Пример #19
0
 def start_request(self):
     """Queue the first job-search POST (``pageNumber`` 1, ``pageSize`` 20).

     All filter fields are sent empty.
     """
     ajax_headers = {
         'Accept': 'application/json, text/javascript, */*; q=0.01',
         'Accept-Encoding': 'gzip, deflate, br',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
         'jsonType': 'jsonType',
         'keyid': 'bb5259528637fe5466a8d77128dd01c2',
         'nbrcafter': 'd5e6332262e2426f810677d6abb191c9',
         'nbrcbefore': '1540447660000',
         'nbrctoken': '',
         'Referer': 'https://www.nbrc.com.cn/job/list.html',
         'X-Requested-With': 'XMLHttpRequest',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
     }
     search_form = {'pageNumber': 1, 'pageSize': 20}
     # Remaining filters are left blank, in the same order as before.
     for blank_field in ('lieBieIds', 'name', 'jobTypeId', 'cityId',
                         'salaryId', 'xingZhiId', 'gongLingIds', 'xueLiId',
                         'guiMoId', 'order'):
         search_form[blank_field] = ''
     self.start_push(Request(url='https://www.nbrc.com.cn/baseApp/app/search/job',
                             method='POST',
                             headers=ajax_headers,
                             data=search_form,
                             callback='parse',
                             allow_proxy=False,
                             allow_redirects=False))
Пример #20
0
 def parse(self,response):
     """Use the returned number to request the list page.

     On HTTP 200 the response text is stored in the request meta as
     ``number`` and, together with ``meta['guid']``, substituted into the
     list URL. The follow-up request uses callback ``cookie_parse``.
     """
     if response.status != 200:
         return

     meta = response.request.meta
     number = response.text()
     meta['number'] = number
     guid = meta['guid']
     browser_headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
     }
     list_url = 'http://wenshu.court.gov.cn/list/list/?sorttype=1&number={}&guid={}&conditions=searchWord+%EF%BC%882012%EF%BC%89%E8%A1%8C%E7%9B%91%E5%AD%97%E7%AC%AC154-1%E5%8F%B7+AH++%E6%A1%88%E5%8F%B7:%EF%BC%882012%EF%BC%89%E8%A1%8C%E7%9B%91%E5%AD%97%E7%AC%AC154-1%E5%8F%B7'.format(number, guid)
     self.push(Request(url=list_url,
                       method='GET',
                       meta=response.request.meta,
                       headers=browser_headers,
                       callback='cookie_parse',
                       priority=2,
                       allow_redirects=False,
                       allow_proxy=True))
Пример #21
0
    def start_request(self):
        """Queue one contact request per id, counting down from 2140000 to 2.

        The Referer header points at the matching ``corpinfo`` page; each
        push is followed by a 0.03 s pause.
        """
        for corp_id in range(2140000, 1, -1):
            ajax_headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'X-Requested-With': 'XMLHttpRequest',
                'Referer': 'http://www.hbsc.cn/corp/corpinfo-{}.html'.format(corp_id),
                'User-Agent': user_agent.random,
            }
            self.start_push(Request(
                url='http://www.hbsc.cn/ashx/Corp/GetContact.ashx?id={}&_=0.7466502927798564'.format(corp_id),
                method='GET',
                headers=ajax_headers,
                callback='parse',
                allow_redirects=False,
                allow_proxy=False,
                timeout=40))
            time.sleep(0.03)
Пример #22
0
 def start_request(self):
     """Queue one page request per id in [1, 11654).

     Each push is followed by a 0.1 s pause.
     """
     for page_id in range(1, 11654):
         browser_headers = {
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'Accept-Language': 'zh-CN,zh;q=0.8',
             'X-Requested-With': 'XMLHttpRequest',
             'Referer': 'http://zhaogong.chinalao.com/4/',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': user_agent.random,
         }
         self.start_push(Request(
             url='http://zhaogong.chinalao.com/{}/'.format(page_id),
             method='GET',
             headers=browser_headers,
             callback='parse',
             allow_proxy=False,
             allow_redirects=False))
         time.sleep(0.1)
Пример #23
0
 def start_request(self):
     """Queue one company page request per id in [1847270, 2100000).

     Every request gets its own copy of the fixed header set (which
     includes a recorded session cookie); each push is followed by a
     0.05 s pause.
     """
     header_template = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Cookie': 'GeolocationTimeOutName_2=1; Geolocation_1=%7B%22bczp_CityCN%22%3A%22%u6DC4%u535A%22%2C%22bczp_citycode%22%3A291600%2C%22city%22%3A%22%u6DC4%u535A%u5E02%22%2C%22geodist%22%3Anull%2C%22geodist_int%22%3A0%2C%22lat%22%3A36.80468485%2C%22lng%22%3A118.05913428%2C%22province%22%3A%22%u5C71%u4E1C%u7701%22%2C%22reftime%22%3A%222018-10-24%2014%3A38%3A33%22%2C%22street%22%3A%22%22%2C%22street_number%22%3A%22%22%7D; hidePtAD_1=1; ASP.NET_SessionId=ynldhjyeobvt4prquncvgylo; route=c8088b91cb0f2fbcbdf107bd31e3d195; UM_distinctid=166a49655510-025ae9911e2cdb-474f0820-1fa400-166a49655528a6; bdshare_firstime=1540359621817; Jw_UserName=bczp78663707d; Jw_PassWord=qaw0%2b7P4aWPp0ju05uA%2bDw%3d%3d; Admin_SN=0; Jw_LogIP=218.247.217.98; EntSearchCookies=%cf%fa%ca%db; Hm_lvt_9c09fb6bb32d4dafc6fd4ec18d310d5b=1540359607; Hm_lpvt_9c09fb6bb32d4dafc6fd4ec18d310d5b=1540437667; CNZZDATA49160=cnzz_eid%3D374406922-1540359096-null%26ntime%3D1540434052; bchatjw7866370=0',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
     }
     for company_id in range(1847270, 2100000):
         self.start_push(Request(
             url='http://www.stzp.cn/jw/showent_{}.aspx'.format(company_id),
             method='GET',
             headers=dict(header_template),
             callback='parse',
             allow_redirects=False,
             allow_proxy=False,
             timeout=40))
         time.sleep(0.05)
Пример #24
0
 def parse(self, response):
     """Queue a detail request per listed job, then the next result page.

     On HTTP 200 the JSON body's ``data.list`` entries each yield a POST to
     ``getJobDetail`` (callback ``info_parse``) carrying the entry's
     ``id`` as ``jobId``. While the request's own ``pageNumber`` is below
     ``data.page.totalPage``, the same request object is re-queued with
     ``pageNumber`` incremented.
     """
     if response.status != 200:
         return

     payload = json.loads(response.content).get('data')
     header_template = {
         'Accept': 'application/json, text/javascript, */*; q=0.01',
         'Accept-Encoding': 'gzip, deflate, br',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
         'jsonType': 'jsonType',
         'keyid': 'bb5259528637fe5466a8d77128dd01c2',
         'nbrcafter': 'd5e6332262e2426f810677d6abb191c9',
         'nbrcbefore': '1540447660000',
         'nbrctoken': '',
         'Referer': 'https://www.nbrc.com.cn/job/list.html',
         'X-Requested-With': 'XMLHttpRequest',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
     }
     for job in payload.get('list'):
         job_id = job.get('id')
         # Each detail request gets its own headers dict.
         self.push(Request(url='https://www.nbrc.com.cn/baseApp/app/job/getJobDetail',
                           method='POST',
                           headers=dict(header_template),
                           data={'jobId': job_id},
                           callback='info_parse',
                           allow_proxy=False,
                           allow_redirects=False))

     total_pages = payload.get('page').get('totalPage')
     form = response.request.data
     if form['pageNumber'] < total_pages:
         form['pageNumber'] += 1
         self.push(response.request)
Пример #25
0
 def start_request(self):
     """Queue one company-info request per id, counting down from 626207 to 1.

     The id is also sent as ``page`` in the request data; each push is
     followed by a 0.06 s pause.
     """
     url_template = 'https://m.jobcn.com/wxapp/getComInfo.ujson?m.comId={}'
     for com_id in range(626207, 0, -1):
         self.start_push(Request(url=url_template.format(com_id),
                                 method='GET',
                                 headers={'User-Agent': user_agent.random},
                                 data={'page': com_id},
                                 callback='parse',
                                 allow_proxy=False,
                                 allow_redirects=False,
                                 timeout=30))
         time.sleep(0.06)
Пример #26
0
    def parse(self,response):
        """Fan out search requests over district, salary band and date.

        On HTTP 200 the page token is read from ``window.zp_pc_nekot``
        (reversed, backslashes removed). If it is absent the request is
        re-queued unchanged. Otherwise one ``qzasync`` request is queued
        (callback ``info_parse``) for every combination of district text,
        salary band and date from ``self.get_date(1)``, carrying the
        ``BAIDUID`` cookie.
        """
        salary_bands = ['0_0', '1_1999', '2001_2999', '3001_4999',
                        '5001_7999', '8001_9999', '10000_99999999']
        dates = self.get_date(1)
        if response.status != 200:
            return

        print({'BAIDUID': response.cookies.get('BAIDUID').value})
        page_text = response.text()
        selector = Selector(text=page_text)
        found = re.findall("window.zp_pc_nekot = '(.*?)';", page_text)
        if not found:
            # No token on the page: try the same request again.
            self.push(response.request)
            return

        token = found[0][::-1].replace('\\', '')
        districts = selector.xpath("//span[@class='areaitem']/text()").extract()
        for district in districts:
            for salary in salary_bands:
                for date in dates:
                    query = {
                        'query': '',
                        'city': response.request.meta['city'],
                        'is_adq': '1',
                        'pcmod': '1',
                        'district': district,
                        'sort_type': '1',
                        'sort_key': '5',
                        'pn': 0,
                        'rn': 10,
                        'token': token,
                        'salary': salary,
                        'date': date,
                    }
                    browser_headers = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, sdch',
                        'Accept-Language': 'zh-CN,zh;q=0.8',
                        'Upgrade-Insecure-Requests': '1',
                        'Referer': str(response.url),
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
                    }
                    self.push(Request(
                        url='http://zhaopin.baidu.com/api/qzasync',
                        method='GET',
                        cookies={'BAIDUID': response.cookies.get('BAIDUID').value},
                        params=query,
                        headers=browser_headers,
                        callback='info_parse',
                        priority=2,
                        allow_redirects=False,
                        allow_proxy=True))
Пример #27
0
    def lasted_parse(self,response):
        """Store the records of a list response, or start validate-code handling.

        Only HTTP 200 responses are processed.

        * Body exactly ``"remind"``: if this proxy has no entry in
          ``self.proxy_dict``, the current request is saved in its own meta
          as ``request_pre``, a ``ValidateCode`` request is queued
          (callback ``certifycode``) and the proxy is recorded with the
          current time. If the proxy already has an entry, the request is
          simply re-queued.
        * Any other body: it is JSON-decoded twice; element 0 supplies
          ``RunEval`` and ``Count``, the remaining elements are inserted
          into ``court_docid`` through ``self.Pipeline``. The same request
          is re-queued with ``Index`` incremented while
          ``Index * Page < Count`` and ``Index < 20``.
        """
        if response.status == 200:
            # NOTE(review): this branch only prints; execution then falls
            # through to the checks below -- confirm that is intended.
            if b'remind key' in response.content:
                print('remind key')
                pass
            if b'"remind"' == response.content:
                # The proxy key falls back to '127.0.0.1' when no proxy is set.
                if not self.proxy_dict.get((response.proxy or '127.0.0.1')):
                    print((response.proxy or '127.0.0.1'),'出现验证码')
                    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                               'Accept-Encoding': 'gzip, deflate, sdch',
                               'Accept-Language': 'zh-CN,zh;q=0.8',
                               'Upgrade-Insecure-Requests': '1',
                               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
                    # Keep the current request so it can be re-queued later
                    # via meta['request_pre'].
                    response.request.meta['request_pre']=response.request
                    request = Request(url='http://wenshu.court.gov.cn/User/ValidateCode',
                                      meta=response.request.meta, method='GET', callback='certifycode',headers=headers, priority=7, proxy=response.proxy,allow_proxy=False,
                                      allow_redirects=False)
                    # Record the proxy before pushing, so later "remind"
                    # responses for it take the else branch below.
                    self.proxy_dict[(response.proxy or '127.0.0.1')] = time.time()
                    self.push(request)
                else:
                    # This proxy already has an entry: just retry the request.
                    self.push(response.request)

            else:
                # The body is a JSON string that itself contains JSON,
                # hence the double decode.
                list_content = json.loads(json.loads(response.text()))
                try:
                    RunEval = list_content[0].get('RunEval')
                except Exception as e:
                    # Unexpected payload shape: log it and retry the request.
                    print(e,list_content,response.request.meta['code'])
                    self.push(response.request)
                    return
                count = list_content[0].get('Count')
                # Element 0 is the header entry; the rest are records.
                for i in list_content[1:]:
                    nopublish_reason = i.get('不公开理由')
                    jgdge_cx = i.get('审判程序')
                    wenshu_id = i.get('文书ID')
                    aj_name = i.get('案件名称')
                    aj_type = i.get('案件类型')
                    aj_code = i.get('案号')
                    court_name = i.get('法院名称')
                    judge_date = i.get('裁判日期')
                    judge_brief = i.get('裁判要旨段原文')
                    insert_sql='''insert into court_docid(aj_code,RunEval,wenshu_id,aj_name,aj_type,court_name,judge_date,judge_brief,nopublish_reason,jgdge_cx) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                    self.Pipeline.process_item((insert_sql, (aj_code,RunEval,wenshu_id,aj_name,aj_type,court_name,judge_date,judge_brief,nopublish_reason,jgdge_cx)))
                # Re-queue the same request for the next index; stops at index 20.
                if response.request.data['Index']*response.request.data['Page']<int(count) and response.request.data['Index']<20:
                    response.request.data['Index']+=1
                    self.push(response.request)
Пример #28
0
    def parse(self, response):
        """Collect ad links from a listing page and follow its next-page link.

        A non-200 response is only reported via ``print``. When the page
        carries the abnormal-behaviour verification notice, the verification
        script URL is looked up, derived image/check URLs plus the current
        URL are stored in the request meta, a GET for the script is queued
        with ``js_parse`` named as callback, and nothing else is processed.
        Otherwise every ``a.ad-title`` href is passed to
        ``self.Pipeline.process_item`` and, if a next-page link exists, the
        same request object is re-queued with its URL updated and priority 3.

        :param response: response object exposing ``status``, ``url``,
            ``content``, ``text()`` and ``request``.
        :return: None
        """
        if response.status != 200:
            print(response.status, response.url)
            return

        page = Selector(text=response.content)
        origin = response.request

        if '系统检测到异常行为,请先进行九宫格验证' in response.text():
            verify_src = page.xpath(
                "//script[contains(@src,'verify.baixing.com.cn/')]/@src"
            ).extract_first('')
            verify_headers = {
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': user_agent.random
            }
            # The image and check endpoints share the script's path and
            # differ only in the extension.
            origin.meta['img_url'] = verify_src.replace('.js', '.jpg')
            origin.meta['check_url'] = verify_src.replace('.js', '.valid')
            origin.meta['url'] = origin.url
            self.push(
                Request(url=verify_src,
                        method='GET',
                        headers=verify_headers,
                        callback='js_parse',
                        meta=origin.meta,
                        allow_redirects=True,
                        allow_proxy=False,
                        priority=3))
            return

        insert_sql = '''insert ignore into bxw_url_new(city,url) VALUES (%s,%s)'''
        for href in page.xpath("//a[@class='ad-title']/@href").extract():
            self.Pipeline.process_item((insert_sql, (origin.meta['city'], href)))

        next_href = page.xpath("//a[text()='下一页']/@href").extract_first('')
        if not next_href:
            return

        # Reuse the current request object for the following page.
        origin.url = urljoin(origin.url, next_href)
        origin.priority = 3
        self.push(origin)
Пример #29
0
 def start_request(self):
     """Queue the initial GET for the 58.com city list page.

     The request names ``parse`` as its callback, uses priority 1 and
     disables both redirects and proxy use; it is handed to
     ``self.start_push``.
     """
     browser_headers = {
         'accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'accept-encoding': 'gzip, deflate, sdch, br',
         'accept-language': 'zh-CN,zh;q=0.8',
         'upgrade-insecure-requests': '1',
         'User-Agent': user_agent.random
     }
     self.start_push(
         Request(url='https://qy.58.com/citylist/',
                 method='GET',
                 headers=browser_headers,
                 callback='parse',
                 priority=1,
                 allow_redirects=False,
                 allow_proxy=False))
Пример #30
0
 def start_request(self):
     """Queue the initial GET for the newjobs.com.cn search-result page.

     The request names ``parse`` as its callback, disables both proxy use
     and redirects, sets ``timeout=50`` and is handed to
     ``self.start_push``.
     """
     browser_headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': user_agent.random
     }
     self.start_push(
         Request(url='http://jobs.newjobs.com.cn/Jobs/SearchResult?name=',
                 method='GET',
                 headers=browser_headers,
                 callback='parse',
                 allow_proxy=False,
                 allow_redirects=False,
                 timeout=50))