def not_person_info(self, response):
    now_person = json.loads(response.text)
    person_info = {
        'companyName': response.meta['company_name'],
        'licenseNum': response.meta['number'],
        'area': '江西省',
        'sex': ''
    }
    for i in now_person['rows']:
        # person name
        person_info['name'] = i['name']
        # contact phone
        person_info['tel'] = i['mobileNum']
        # ID card number
        person_info['idCard'] = i['idNumber']
        # professional-title information
        if len(i['jobTitleCertInfo']) != 0:
            # title level
            try:
                person_info['grade'] = i['titleLevel']['name']
            except KeyError:
                person_info['grade'] = ''
            # title major
            person_info['major'] = i['jobTitleCertInfo'][0]['specificTitleMajor']
            # certificate number
            person_info['num'] = i['jobTitleCertInfo'][0]['certificateNumber']
            # issue date
            try:
                # validity period
                c = time.localtime(int(i['jobTitleCertInfo'][0]['issuedDt'] / 1000))
                person_info['validTime'] = str(time.strftime("%Y-%m-%d", c))
            except KeyError:
                person_info['validTime'] = ''
            person_info['regNum'] = ''
            person_info['tokenKey'] = self.token
            print('non-registered personnel info %s' % person_info)
            yield scrapy.FormRequest(
                url='https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                formdata=person_info,
                callback=self.person_zz,
                meta={'company_name': response.meta['company_name']},
                dont_filter=True)
        else:
            # position certificates
            if i['positionCertInfos']:
                print('I am %s -- non-registered personnel -- company is %s ---'
                      % (i['name'], response.meta['company_name']))
                try:
                    person_info['grade'] = i['positionCertInfos'][0]['positionType']['name']
                except IndexError:
                    person_info['grade'] = ''
                # certificate number
                try:
                    person_info['num'] = i['positionCertInfos'][0]['certificateNumber']
                except IndexError:
                    person_info['num'] = ''
                # validity period
                try:
                    c = time.localtime(int(i['positionCertInfos'][0]['expiryDt'] / 1000))
                    person_info['validTime'] = str(time.strftime("%Y-%m-%d", c))
                except KeyError:
                    person_info['validTime'] = ''
                person_info['regNum'] = ''
                person_info['major'] = ''
                person_info['tokenKey'] = self.token
                print('non-registered personnel info %s' % person_info)
                yield scrapy.FormRequest(
                    url='https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                    formdata=person_info,
                    callback=self.person_zz,
                    meta={'company_name': response.meta['company_name']},
                    dont_filter=True)
            else:
                person_info['grade'] = ''
                person_info['major'] = ''
                person_info['validTime'] = ''
                person_info['num'] = ''
                person_info['regNum'] = ''
                person_info['tokenKey'] = self.token
                yield scrapy.FormRequest(
                    url='https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                    formdata=person_info,
                    callback=self.person_zz,
                    meta={'company_name': response.meta['company_name']},
                    dont_filter=True)
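# Both not_person_info() above and the person_info() snippet at the end of this
# section convert a millisecond epoch timestamp into a "%Y-%m-%d" string inline.
# A minimal sketch of that pattern pulled into a reusable helper; the name
# ms_to_date and the default fallback are assumptions, not part of the original
# spiders:
import time

def ms_to_date(ms, default=''):
    """Convert a millisecond epoch timestamp to 'YYYY-MM-DD', or return
    `default` when the value is missing or malformed."""
    try:
        return time.strftime("%Y-%m-%d", time.localtime(int(ms) / 1000))
    except (TypeError, ValueError):
        return default

# Usage: person_info['validTime'] = ms_to_date(cert.get('issuedDt'))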
def start_requests(self):
    yield scrapy.FormRequest(self.url, formdata=self.formdata)
def sv11_tableau_cvi(self, response):
    self.log('sv11_tableau_cvi')
    if os.environ.get('PRODOUANE_DEBUG'):
        with open("debug/sv11_07_tableau.html", 'wb') as f:
            f.write(response.body)
    info = []
    nb_docs = 0
    if response.css('.dr-table-headercell'):
        nb_docs = int(response.css('.dr-table-headercell').re(r'\((\d+)\)')[0])
    if nb_docs:
        for tr in response.css('#formDeclaration tbody tr'):
            if tr.css('td')[3].css('a::attr(id)'):
                idhtml = tr.css('td')[3].css('a::attr(id)')[0].extract()
                cvi = tr.css('td::text')[1].extract()
                date = 'XXXX'
                if len(tr.css('td::text')[2].extract().split('/')) > 1:
                    date = tr.css('td::text')[2].extract().split('/')[2]
                # download only if a CVI is specified
                self.log("new line : {date: %s, cvi: %s, idhtml: %s}" % (date, cvi, idhtml))
                info.append({'date': date, 'cvi': cvi, 'idhtml': idhtml})
                if not len(response.meta['cvi']):
                    print("new cvi found : sv11 " + cvi)
    args = self.get_input_args(response, '#formFiltre')
    id = response.meta['id']
    if not len(response.meta['cvi']):
        self.log('id %s : %d (%d)' % ('NO MORE CVI', id, len(info)))
        if (len(info) == 30) and (nb_docs > (30 * (response.meta['page'] + 1))):
            response.meta['page'] = response.meta['page'] + 1
            myargs = {
                'javax.faces.ViewState': args['javax.faces.ViewState'],
                'formDeclaration:_link_hidden_': '',
                'formDeclaration:listeDeclaration:scrollerId': '%d' % (response.meta['page'] + 1),
                'formDeclaration_SUBMIT': "1",
                'autoScroll': "0,0",
            }
            response.meta['id'] = 0
            yield scrapy.FormRequest(
                url='https://www.douane.gouv.fr/ncvi-web-sv11-prodouane/jsp/accueilOrganisme.jsf',
                formdata=myargs,
                callback=self.sv11_tableau_cvi,
                meta=response.meta)
        else:
            response.meta['page'] = 0
            response.meta['id'] = 0
            response.meta['commune'] = response.meta['commune'] + 1
            if response.meta['nb_communes'] <= response.meta['commune']:
                response.meta['departement'] = response.meta['departement'] + 1
                response.meta['commune'] = 0
            if response.meta['nb_departements'] > response.meta['departement']:
                yield scrapy.FormRequest(
                    url='https://www.douane.gouv.fr/ncvi-web-sv11-prodouane/jsp/accueilOrganisme.jsf?commune=%d&dep=%d'
                        % (response.meta['commune'], response.meta['departement']),
                    callback=self.sv11_accueil,
                    meta=response.meta)
    elif len(info) > id:
        self.log('id %s : %d (%d)' % (info[id]['cvi'], id, len(info)))
        i = info[id]
        myargs = {
            'javax.faces.ViewState': args['javax.faces.ViewState'],
            'formDeclaration:_link_hidden_': '',
            'formDeclaration:_idcl': i['idhtml'],
            'formDeclaration_SUBMIT': "1",
            'autoScroll': "0,0",
        }
        response.meta['id'] = id
        response.meta['info'] = info
        yield scrapy.FormRequest(
            url='https://www.douane.gouv.fr/ncvi-web-sv11-prodouane/jsp/accueilOrganisme.jsf',
            formdata=myargs,
            callback=self.sv11_html_sv11,
            meta=response.meta)
    else:
        self.log('no document found for %s' % response.meta['cvi'])
def start_requests(self):
    self.formdata['province'] = str(next(self.provinces))
    yield scrapy.FormRequest(url=self.query_url, formdata=self.formdata, callback=self.parse)
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.FormRequest(url=url, cookies=self.cookies, callback=self.parse)
def start_requests(self):
    # start_requests cannot be used together with Rule-based crawling
    return [scrapy.FormRequest('https://www.douban.com/accounts/login',
                               formdata={'form_email': 'email', 'form_password': '******'},
                               callback=self.loged_in)]
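# A hedged alternative to building the Douban login request by hand: when the
# login page is fetched first, scrapy.FormRequest.from_response() copies the
# form's hidden fields (CSRF tokens and the like) automatically, so only the
# visible credential fields need overriding. A minimal sketch; the spider name,
# credentials, and after_login callback are illustrative assumptions, not taken
# from the snippet above.
import scrapy

class LoginSketchSpider(scrapy.Spider):
    name = 'login_sketch'
    start_urls = ['https://www.douban.com/accounts/login']

    def parse(self, response):
        # from_response pre-fills every <input> found in the page's form
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'form_email': 'email', 'form_password': '******'},
            callback=self.after_login)

    def after_login(self, response):
        self.log('logged in, landed on %s' % response.url)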
def parse_data(self, response):
    try:
        trs = response.xpath('//tr[contains(@bgcolor, "#ffffff")]')
        if len(trs) > 0:
            # open_in_browser(response)
            self.logger.info("fd = %s" % str(response.meta['fd']))
            for tr in trs:
                trx = Selector(text=tr.extract())
                row = trx.xpath('//td')
                if len(row) == 0:
                    return
                td2 = Selector(text=row[2].extract().replace('\r\n', '').replace('\t', ''))
                itemType = td2.xpath('//b//text()').extract_first()
                if itemType == '아파트':  # '아파트' is Korean for "apartment"
                    auctionDate = re.findall('\d{4}\.\d{2}\.\d{2}', td2.extract())
                    td3 = Selector(text=row[3].extract().replace('\r\n', '').replace('\t', ''))
                    addr = td3.extract()[td3.extract().index('<br>') + 4:td3.extract().index('<!--')]
                    auctionLoc = re.findall('>(\S*계)', td3.extract())
                    id = re.findall("pop_detail\('(.+)',", td3.extract())[0]
                    self.logger.info("{},{},{}".format(id, addr, itemType))
                    fd2 = {"idcode": '{}'.format(id)}
                    yield scrapy.FormRequest(self.detail_url, callback=self.parse_detail, formdata=fd2)
            pages = re.findall("javascript:submit_chk\('(\d)'\);", response.text)
            if len(pages) > 0:
                self.logger.info("paging = %s" % str(pages))
                # open_in_browser(response)
                fd = response.meta['fd']
                nowPage = int(str(fd['nowPge']))
                linkPage = int(str(pages[-1]))
                if linkPage > nowPage:
                    fd['nowPge'] = str(nowPage + 1)
                    # self.logger.info("paging = %s -> %s" % (str(nowPage), fd['nowPge']))
                    r = scrapy.FormRequest(self.base_url, callback=self.parse_data, formdata=fd)
                    r.meta['fd'] = fd
                    yield r
        else:
            self.logger.info("blank")
    except Exception:
        open_in_browser(response)
def start_requests(self): return [ scrapy.FormRequest("http://www.gewara.com/cinema/searchOpi.xhtml", cookies={'citycode': '320100'}, callback=self.parse_movie) ]
def start_requests(self):
    url = 'https://gateway.chotot.com/v1/public/web-proxy-api/loadRegions'
    yield scrapy.FormRequest(url=url, method='GET', headers=self.headers_city, callback=self.parse_district)
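# Several snippets in this section pass method='GET' to scrapy.FormRequest
# (this one, and the self.data / self.query variants further down). With GET,
# Scrapy url-encodes the formdata into the query string instead of the request
# body. A minimal sketch of that behavior, using a placeholder URL:
import scrapy

req = scrapy.FormRequest('https://example.com/search',
                         method='GET',
                         formdata={'q': 'hello world', 'page': '2'})
# req.url is now roughly 'https://example.com/search?q=hello+world&page=2'
# and req.body is empty.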
def parse(self, response):
    def default_item(xpath_value):
        try:
            return response.xpath(xpath_value).extract_first().strip()
        except AttributeError:
            return ''

    try:
        EVENTTARGET = default_item('//a[@class="consultationSummaryBtn"]/@href').split("'")[1]
    except (TypeError, IndexError):
        EVENTTARGET = ''
    VIEWSTATEFIELDCOUNT = default_item('//input[@id="__VIEWSTATEFIELDCOUNT"]/@value')
    VIEWSTATE = default_item('//input[@id="__VIEWSTATE"]/@value')
    VIEWSTATE1 = default_item('//input[@id="__VIEWSTATE1"]/@value')
    VIEWSTATEGENERATOR = default_item('//input[@id="__VIEWSTATEGENERATOR"]/@value')
    SCROLLPOSITIONX = '0'
    SCROLLPOSITIONY = '0'
    EVENTVALIDATION = default_item('//input[@id="__EVENTVALIDATION"]/@value')
    ctl22_ctl00_ddBoards = 'http://www.medicalboard.gov.au/'
    ctl22_ddBoards = ctl22_ctl00_ddBoards
    ItemId = default_item(
        '//input[@name="content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnConsultationItemId"]/@value')
    DetailedContent = default_item(
        '//input[@name="content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnDetailedContent"]/@value')
    Submissions = default_item(
        '//input[@name="content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnSubmissions"]/@value')
    yield scrapy.FormRequest(
        url=self.start_urls[0],
        formdata={
            '__EVENTTARGET': EVENTTARGET,
            '__VIEWSTATEFIELDCOUNT': VIEWSTATEFIELDCOUNT,
            '__VIEWSTATE': VIEWSTATE,
            '__VIEWSTATE1': VIEWSTATE1,
            '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
            '__SCROLLPOSITIONX': SCROLLPOSITIONX,
            '__SCROLLPOSITIONY': SCROLLPOSITIONY,
            '__EVENTVALIDATION': EVENTVALIDATION,
            'ctl22$ctl00$ddBoards': ctl22_ctl00_ddBoards,
            'ctl22$ddBoards': ctl22_ddBoards,
            'content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnConsultationItemId': ItemId,
            'content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnDetailedContent': DetailedContent,
            'content_0$contentcolumnmain_0$rptPastConsultations$ctl00$hdnSubmissions': Submissions,
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATEENCRYPTED': '',
            'ctl22$ctl00$ucSearch$txtSearch': '',
            'ctl22$ucSearch$txtSearch': '',
        },
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'www.medicalboard.gov.au',
            'Origin': 'http://www.medicalboard.gov.au',
            'Referer': 'http://www.medicalboard.gov.au/News/Past-Consultations.aspx',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36',
        },
        callback=self.parse_comments)
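# The ASP.NET postback above copies __VIEWSTATE, __EVENTVALIDATION and friends
# by hand. A hedged shortcut worth knowing: when the page exposes the form in
# its HTML, scrapy.FormRequest.from_response() carries all hidden inputs over
# automatically, so only __EVENTTARGET and the fields you actually change need
# to be spelled out. A minimal sketch under that assumption; the formxpath,
# postback target, and callback name are illustrative, not taken from the
# snippet above.
import scrapy

def parse_aspnet(self, response):
    yield scrapy.FormRequest.from_response(
        response,
        formxpath='//form[@id="aspnetForm"]',  # assumption: the page has one main form
        formdata={'__EVENTTARGET': 'some$postback$target'},  # hypothetical target
        callback=self.parse_comments)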
def start_requests(self):
    yield scrapy.FormRequest(self.get_url, method='GET', formdata=self.data, callback=self.parse_list)
def start_requests(self):
    yield scrapy.FormRequest(self.url, formdata=self.formdata, headers=self.headers, callback=self.parse)
def start_requests(self): """ 爬虫默认接口,启动方法 :return: """ # 获取爬取时传过来的参数 # start_time: 开始时间 # end_time: 结束时间 # start_page: 开始页 (优先于start_time) # end_page: 结束页 (优先于end_time) # stop_item: 连续遇到[stop_item]个重复条目后,退出本次爬取 # spider_name: 指定的spider_name,如果不指定,使用self.name # command example: # nohup python3 -m scrapy crawl ccgp_guizhou_spider -a start_time="2019:01:01" -a end_time="2020:02:25" > /dev/null& # py -3 -m scrapy crawl base_spider -a start_time="now" -a end_time="now" # py -3 -m scrapy crawl base_spider -a start_time="now" -a end_time="now" -a start_page="700" -a end_page="1000" -a stop_item="10000" assert self.start_time is not None assert self.end_time is not None self.crawl_mode = CrawlMode.REAL_TIME if str(self.start_time).lower() == 'now' else CrawlMode.HISTORY if self.crawl_mode == CrawlMode.HISTORY: if (len(self.start_time) != 10 or len(self.end_time) != 10 or self.start_time[4] != ':' or self.end_time[4] != ':'): logging.error('Bad date format start_time:[{}] end_time:[{}]. Example: 2019:01:01'.format( self.start_time, self.end_time)) return else: # 取当天日期 _dt = datetime.fromtimestamp(time.time()) self.start_time = _dt.strftime("%Y:%m:%d") self.end_time = self.start_time # 初始化self.crawl_helper self.init_crawl_helper() # 主要配置项 _source_info = { # 页面的key,保证唯一 'page_1': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '采购需求公告', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153332561072666', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_2': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '采购公告', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153418052184995', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 
0, }, 'page_3': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '更正公告', 'notice_type_code': '0204', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153454200156791', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_4': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '废标公告', 'notice_type_code': '0204', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153488085289816', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_4': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '中标(成交)公告', 'notice_type_code': '0202', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153531755759540', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_5': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 
'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '单一来源公告', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153567415242344', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_7': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '单一来源(成交)公告', 'notice_type_code': '0202', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153595823404526', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, # 市县标讯 'page_8': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '单一来源(成交)公告', 'notice_type_code': '0202', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153796890012888', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_9': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': 
self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '采购需求公告', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153796890012888', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_10': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '采购公告', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153796890012888', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_11': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '采购公告', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153797950913584', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_12': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': 
self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '更正公告', 'notice_type_code': '0204', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153817836808214', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_13': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '废标公告', 'notice_type_code': '0202', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153845808113747', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_14': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '中标(成交)公告', 'notice_type_code': '0202', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153905922931045', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_15': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 
提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '单一来源公示', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153924595764135', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_16': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '单一来源(成交)公示', 'notice_type_code': '0202', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1153937977184763', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, 'page_17': { # 通常会被填充在'source'字段里,有时也可以放在'tos' 'name': '贵州省政府采购网', # list页面的base地址 'base_url': 'http://www.ccgp-guizhou.gov.cn/article-search.html', # list页面的call_back处理函数 'callback': self.parse_list_page_common, 'method': "post", 'requests_type': "html", # 得到下一页url的函数,返回值一定是一个url 'get_next_page_url': self.get_normal_next_page_url, # 网站中该页面的最大页数,(可选配置,仅为优化程序执行效率,可不填) 'stop_page_num': 1000, # 连续遇到[stop_dup_item_num]个重复条目后,停止本次抓取 # 提示:在程序运行初始阶段,此值可以设的较大,以便爬取所有的历史记录 'stop_dup_item_num': 500000 if self.crawl_mode == CrawlMode.HISTORY else 60, # list页面中,获得条目列表的xpath 'xpath_of_list': '//div[@class="xnrx"]/ul/li', # 获得每一个条目链接地址的xpath 'xpath_of_detail_url': './a/@href', # 对每一个条目进行解析,返回CommonRawItem的类,需要实现 'item_parse_class': BaseItemCommonParser, # 其它信息,可以辅助生成CommonRawItem的字段 # 参考函数parse_list_page_common() 中 item_parser.get_common_raw_item()代码 'tos': '政府采购', 'tos_code': '02', 'source': '贵州省政府采购网', 'notice_type': '资格预审公告', 'notice_type_code': '0201', 'site_name': '贵州省政府采购网', 'area_code': '52', 'content_code': '1', 'industryName': '', 'category.id': '1156071132710859', 'time_type': 6 if self.crawl_mode == CrawlMode.HISTORY else 0, }, } logging.info('start crawling...') # 轮询每个类别 for _k, _v in _source_info.items(): # 填充爬取的基本信息 self.crawl_helper.init_crawl_info(_k, _v) # 假定每个类别有不超过100000个页面 for _page_num in range(100000): # 轮询公告中的不同list页面 if self.crawl_helper.get_stop_flag(_k): break # 根据获得下一页的函数,得到下一页的URL _request_url = _v['get_next_page_url'](page_index=_page_num, base_url=_v['base_url']) # _request = "" # 生成request if _v["method"] == "post": _payload = { 'siteId': "1", 'category.id': _v["category.id"], 'areaName': "", 'tenderRocurementPm': "", 'keywords': "", 'articlePageNo': 
str(_page_num + 1), 'articlePageSize': "15" } _request = scrapy.FormRequest(url=_request_url, formdata=_payload, callback=_v['callback']) else: _request = scrapy.Request(_request_url, callback=_v['callback']) # 如果需要js渲染,需要使用下面的函数 # _request = SplashRequest(_request_url, callback=_v['callback'], args={'wait': 2}) # 填充必要的参数 _request.meta['param'] = _v _request.meta['crawl_key'] = _k _request.meta['page_index'] = _page_num + 1 yield _request # 单个类别的爬取结束 self.crawl_helper.stop_crawl_info(_k) logging.info('stop crawling...')
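# start_requests() above calls self.get_normal_next_page_url(page_index=...,
# base_url=...) but the helper itself is not shown here. A minimal sketch of
# what such a helper might look like, purely an assumption about its contract:
# it only needs to return a URL for the given page index, and for a POSTed
# search form the page number travels in the form payload, so the base URL can
# be returned unchanged.
def get_normal_next_page_url(self, page_index, base_url):
    # POST-style pagination: the page number is carried by 'articlePageNo'
    # in the form payload, so every page shares the same URL.
    return base_url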
def parse(self, response):
    re = response
    # print(response.text)
    # process the data
    # order id
    order_id = re.xpath('//*[@id="order-form"]/div[1]/div[2]/div[1]/div[1]/p/text()').extract()[0]
    # shipping tracking number
    tracking_number = re.xpath('//*[@id="order-form"]/div[5]/div[2]/div[2]/div[1]/input/@value').extract()[0]
    # order status
    order_st = re.xpath('//*[@id="order-form"]/div[1]/div[2]/div[1]/div[2]/p/text()').extract()[0]
    # Mabang shipping time
    expresstime = re.xpath('//*[@id="order-form"]/div[1]/div[2]/div[8]/div[2]/input/@value').extract()[0]
    # Mabang order id
    mb_orderid = re.xpath('//*[@id="order-form"]/div[1]/div[1]/input[1]/@value').extract()[0]
    # print(mb_orderid)
    # print(order_st)
    if order_st == '已发货':  # '已发货' means "shipped"
        # data forwarded via meta
        mb_meta = {
            'order_id': order_id,
            'tracking_number': tracking_number,
            'order_st': order_st,
            'expresstime': expresstime,
            'mb_orderid': mb_orderid
        }
        # fetch the SKUs of the merged order
        url = 'https://aamz.mabangerp.com/index.php?mod=order.findrelevantinfo'
        headers = {
            # "Accept": "application/json, text/javascript, */*; q=0.01",
            # "Accept-Encoding": "gzip, deflate, br",
            # "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            # "Cache-Control": "no-cache",
            # "Connection": "keep-alive",
            # "Content-Type": "application/json; charset=UTF-8",
            # "Host": "aamz.mabangerp.com",
            # "Content-Length": "",
            # "X-Requested-With": "XMLHttpRequest",
            # "Referer": "https://aamz.mabangerp.com/index.php?mod=order.detail&platformOrderId=0O43LJNW&orderStatus=2&orderTable=2&tableBase=2&cMKey=MABANG_ERP_PRO_MEMBERINFO_LOGIN_191565&lang=cn",
            # note: the User-Agent must not contain stray spaces
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0",
        }
        # POST request payload
        data1 = {'orderId': mb_orderid, 'type': '1', 'tableBase': '2'}
        # cookie string copied from the browser
        cookies = "gr_user_id=493499a8-83fc-4e47-87c3-08b1ded6df3c; MULTI_LANGUAGE_TYPE=%2BYjZ6oacL7xJ%2FKOcmBg9Z7cTOqi7UgOUgujRs4KQ4Ms%3D; lang=cn; stock_show_product_data_cookie=ico-minus-circle; stock_data_js_cookie_is_change_weight=1; mabang_lite_rowsPerPage=500; stock_data_js_cookie_is_change_name=1; order_data_js_cookie_orderErrorbysVal=paidTime; order_data_js_cookie_orderErrorbydacname=orderByspaidTime; order_data_js_cookie_orderErrorbydacnameval=down; order_data_js_cookie_isSyn=2; employ_rows_per_page_data_cookie=50; order_data_js_cookie_isImmediately=1; signed=222014_00f6735cc675f0abb6f483d9913f72bf; PHPSESSID=gjgkl12ntct9knahgq66qtlks1; event_rember12_222014=0; CRAWL_KANDENG_KEY=K6uqW0ZkQEouz0n1adoI%2FWqfFs2PbJ8%2BCpQKvtnzAvWpTX174VXBmq5L9cDOSOj%2Bm2IcDf7pRauH34yzR4OEyw%3D%3D; loginLiteCookie=a%3A2%3A%7Bs%3A8%3A%22username%22%3Bs%3A6%3A%22222014%22%3Bs%3A9%3A%22passsword%22%3Bs%3A32%3A%22f1c7edfb07a416030a0f976bac902add%22%3B%7D"
        # split on the first '=' only, so values containing '=' stay intact
        cookies = dict(i.split("=", 1) for i in cookies.split("; "))
        # yield scrapy.Request(url=url, cookies=self.cookies, headers=headers, meta=mb_meta, callback=self.parse2)
        yield scrapy.FormRequest(url=url, cookies=cookies, formdata=data1,
                                 headers=headers, meta=mb_meta, callback=self.detail_parse)
    else:
        pass
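# The dict comprehension above turns a browser-copied "Cookie:" header into the
# dict Scrapy expects. A minimal sketch of that conversion as a standalone
# helper (the name parse_cookie_string is an assumption); splitting on the
# first '=' only is what keeps base64-style values with trailing '=' intact:
def parse_cookie_string(raw):
    """Parse 'k1=v1; k2=v2' into {'k1': 'v1', 'k2': 'v2'}."""
    return dict(pair.split("=", 1) for pair in raw.split("; "))

# Usage:
# cookies = parse_cookie_string("sessionid=abc123; token=xyz=")
# yield scrapy.FormRequest(url, cookies=cookies, formdata=data, callback=self.parse)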
def start_requests(self):
    for i in range(1, self.page + 1):
        data = {
            "pageNum": "{}".format(i)
        }
        yield scrapy.FormRequest(self.start_urls[0].format(i), headers=self.headers, formdata=data)
import os
def start_requests(self):
    request_url = 'https://maoyan.com/films'
    return [scrapy.FormRequest(request_url, callback=self.parse_movie)]
def parse_item_1(self, response):
    response_url = response.url
    print('1.response_url:', response_url)
    response_status = response.status
    print('step 1 status code:', response_status)
    if response_status == 403:
        captcha = response.xpath("//img[@class='yzm-pic']/@src").extract()
        print('captcha', captcha)
        if len(captcha) > 0:
            # a captcha is present: have a human solve it
            print("saving the captcha image")
            captchapicfile = "/Users/ozintel/Tsl_exercise/znfw_crawer/lawtime_married_family/data/captcha1.png"
            # download the image stream
            ssl._create_default_https_context = ssl._create_unverified_context
            with request.urlopen(captcha[0]) as fp:
                data = fp.read()
            # write the image out in binary mode
            f = open(captchapicfile, 'wb')
            f.write(data)
            f.close()
            power_key = ''.join(response.xpath(
                '//div[@class="regform-box"]/form[@name="reform"]/input[1]/@value').extract())
            captcha = input('*********please type the captcha:\n')
            con_value = ''.join(response.xpath(
                '//div[@class="regform-box"]/form[@name="reform"]/input[2]/@value').extract())
            formdata = {
                'vgcode': captcha,
                'power_key': power_key,
                # 'servertype': '10',
                # 'requestmode': 'async',
                'continue': con_value
            }
            print('formdata', formdata)
            self.state_count = self.state_count + 1
            # these fields come from the form in the page source,
            # not from what the network tab shows (that does not work)
            yield scrapy.FormRequest(
                url='http://ipfilter.lsurl.cn/index.php?m=Home&c=IpFilter&a=submit_verification',
                headers=self.headers,
                formdata=formdata,
                # callback=self.parse_page
                callback=self.parse_item_1)
    else:
        item = DmozItem()  # defined in item.py; the dict is handed back to the pipeline
        links = response.xpath('//ul[@class="list-main"]/li/div/a/@href').extract()
        print('number of entries on this page:', len(links))
        for link in links:
            # print(link)
            yield response.follow(url=link, callback=self.parse_item_2)
def start_requests(self):
    start_urls = 'http://cdfy.chinacourt.gov.cn/article/search/content_time_publish_begin/2002-01-01/content_time_publish_end/2030-03-03/article_category_id//content_author//keyword/%E4%B8%8D%E5%BF%98%E5%88%9D%E5%BF%83%E3%80%81%E7%89%A2%E8%AE%B0%E4%BD%BF%E5%91%BD/button/%E6%8F%90%E4%BA%A4/page/1.shtml'
    yield scrapy.FormRequest(start_urls, callback=self.parse, headers=self.Headers, cookies=self.cookies)
def parse(self, response):
    response_status = response.status
    print('step 0 status code:', response_status)
    response_url = response.url
    print('0.response_url:', response_url)
    if response_status == 403:
        captcha = response.xpath("//img[@class='yzm-pic']/@src").extract()
        print('captcha', captcha)
        if len(captcha) > 0:
            # a captcha is present: have a human solve it
            print("saving the captcha image")
            captchapicfile = "/Users/ozintel/Tsl_exercise/znfw_crawer/lawtime_married_family/data/captcha.png"
            # download the image stream
            ssl._create_default_https_context = ssl._create_unverified_context
            with request.urlopen(captcha[0]) as fp:
                data = fp.read()
            # write the image out in binary mode
            f = open(captchapicfile, 'wb')
            f.write(data)
            f.close()
            print('writing response.body', response.body)
            with open(r'/Users/ozintel/Tsl_exercise/znfw_crawer/lawtime_married_family/data/A_1.html', 'wb') as f:
                f.write(response.body)
            power_key = ''.join(response.xpath(
                '//div[@class="regform-box"]/form[@name="reform"]/input[1]/@value').extract())
            captcha = input('*********please type the captcha:\n')
            con_value = ''.join(response.xpath(
                '//div[@class="regform-box"]/form[@name="reform"]/input[2]/@value').extract())
            formdata = {
                'vgcode': captcha,
                'power_key': power_key,
                # 'servertype': '10',
                # 'requestmode': 'async',
                'continue': con_value
            }
            print('formdata', formdata)
            self.state_count = self.state_count + 1
            # these fields come from the form in the page source,
            # not from what the network tab shows (that does not work)
            yield scrapy.FormRequest(
                url='http://ipfilter.lsurl.cn/index.php?m=Home&c=IpFilter&a=submit_verification',
                headers=self.headers,
                formdata=formdata,
                callback=self.parse)
            # time.sleep(5)
    else:
        # page links; this response always contains them
        uri_list = response.xpath(
            '(//div[@class="paging paging-a"]/a/@href)[position()<last()]').extract()
        count_link = len(uri_list)
        print('number of page links on this page:', count_link)
        url_first = re.sub(r'http://www.lawtime.cn', '', response_url)
        uri_list.append(url_first)
        for uri in uri_list:
            uri_1 = 'http://www.lawtime.cn/' + uri
            yield response.follow(url=uri_1, callback=self.parse_item_1)
        # advance to the next month
        self.month_1 = self.month_1 + 1
        if self.month_1 == 13:
            self.year_1 = self.year_1 + 1
            self.month_1 = 1
            m = '0' + str(self.month_1)
            ym = (self.year_1, m)
        else:
            if self.month_1 <= 9:
                m = '0' + str(self.month_1)
                ym = (self.year_1, m)
            else:
                ym = (self.year_1, self.month_1)
        if self.year_1 == 2017 and self.month_1 > 11:
            raise CloseSpider()
        uri_2 = "http://www.lawtime.cn/ask/browse_s91_d%s%s.html" % ym
        # e.g. http://www.lawtime.cn/ask/browse_s4_p13.html
        # uri_2 = "http://www.lawtime.cn/ask/browse_s4_p%s.html" % (self.page)
        # yield scrapy.Request(url=uri_2, meta={"cookiejar": 1}, callback=self.parse)
        yield scrapy.Request(url=uri_2, callback=self.parse)
def start_requests(self): return [ scrapy.FormRequest("http://txdai.com/", headers=self.user_agent) ]
def start_requests(self):
    yield scrapy.FormRequest(
        url=self.endpoint,
        method='GET',
        formdata=self.query,
        callback=self.parse
    )
def start_requests(self):
    ### TODO: figure out what to do about dates
    DATES = '2017_12_09_2017_12_10'
    URL = "https://www.tripadvisor.com/Hotels"
    headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Host': 'www.tripadvisor.com',
        'Pragma': 'no-cache',
        'Referer': '',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36.',
        'X-Requested-With': 'XMLHttpRequest'
    }
    form_data = {
        'adults': '2',
        'dateBumped': 'NONE',
        'displayedSortOrder': 'popularity',
        'geo': '',
        'hs': '',
        'isFirstPageLoad': 'false',
        'rad': '0',
        'refineForm': 'true',
        'requestingServlet': 'Hotels',
        'rooms': '1',
        'scid': 'null_coupon',
        'searchAll': 'false',
        'seen': '150',
        'sequence': '7',
        'o': "0",
        'staydates': DATES
    }
    cookies = {"SetCurrency": "USD"}
    # read from the necessary intermediate URLs
    with open("intermediate/urls.csv") as f:
        reader = csv.reader(f)
        for line in reader:
            url = urljoin('http://www.tripadvisor.com', line[0])
            geo = line[1]
            headers['Referer'] = url
            form_data['geo'] = geo
            yield scrapy.FormRequest(url=URL, method='POST', formdata=form_data,
                                     cookies=cookies, headers=headers,
                                     callback=self.parse,
                                     meta={'seen': '0', 'url': url})
def start_requests(self):
    yield scrapy.FormRequest(self.start_urls[0], formdata=self.form_data, callback=self.homepage_parse)
def parse(self, response): print("## Beginning to parse ", self.crtPage, " page") crtTr = 0 item = BacItem() self.errors = 0 if self.started: for tr in response.xpath( '(//table[@class="mainTable"]/tr/td[@class="tdBac"])'): if crtTr == None: continue if crtTr % 31 == 0: item['nr'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 1: item['nume'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 2: item['posIerarhieJudet'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 3: item['posIerarhieTara'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 4: item['unitInvatamant'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 5: item['judet'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 6: item['promotieAnterioara'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 7: item['formaEducatie'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 8: item['specializare'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 9: item['examenOralRomana'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 10: item['notaScrisaRomana'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 11: item['notaContestatieRomana'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 12: item['notaFinalaRomana'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 13: item['limbaMaterna'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 14: item['limbaModerna'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 15: item['notaLimbaModerna'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 16: item['disciplinaObligatorie'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 17: item['disciplinaAlegere'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 18: item['competenteDigitale'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 19: item['medie'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 20: item['rezultatFinal'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 21: item['competenteMaterna'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 22: item['notaScrisaMaterna'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 23: item['notaContestatieMaterna'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 24: item['notaFinalaMaterna'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 25: item['notaDisciplinaObligatorie'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 26: item[ 'notaContestatieDisciplinaObligatorie'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 27: item['notaFinalaDisciplinaObligatorie'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 28: item['notaDisciplinaAlegere'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 29: item['notaContestatieDisciplinaAlegere'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 30: item['notaFinalaDisciplinaAlegere'] = BeautifulSoup( tr.extract().encode("utf-8")).get_text() if crtTr % 31 == 0 and crtTr != 0: yield item crtTr += 1 
if self.started: print("## Parsing page ", self.crtPage, " ended") #Go to the next page if not self.started: self.crtPage = self.startPage self.started = True else: self.crtPage += 1 if (self.crtPage <= self.endPage): while True: print("## Trying to jump to next page: ", self.crtPage) try: if response.css( 'input#__VIEWSTATE::attr(value)').extract_first( ) != None and response.css( 'input#__VIEWSTATEGENERATOR::attr(value)' ).extract_first() != None and response.css( 'input#__EVENTVALIDATION::attr(value)' ).extract_first() != None: self._viewState = response.css( 'input#__VIEWSTATE::attr(value)').extract_first() self._viewGenerator = response.css( 'input#__VIEWSTATEGENERATOR::attr(value)' ).extract_first() self._eventValidation = response.css( 'input#__EVENTVALIDATION::attr(value)' ).extract_first() yield scrapy.FormRequest( 'http://bacalaureat.edu.ro/Pages/TaraRezultAlfa.aspx', formdata={ 'ctl00$ContentPlaceHolderBody$DropDownList2': str(self.crtPage), '__VIEWSTATE': self._viewState, '__VIEWSTATEGENERATOR': self._viewGenerator, '__EVENTVALIDATION': self._eventValidation }, callback=self.parse, dont_filter=True) break except Exception as e: print("Error when loading page ", self.crtPage, e) self.errors += 1 if self.errors >= 2: exit(0) time.sleep(25)
def parse(self, response):
    url_done = []
    urls = [
        'https://www.amazon.com/Haier-HC17SF15RB-Refrigerator-Freezer-Qualified/dp/B00N142GLI?_encoding=UTF8&psc=1'
    ]
    # url = 'http://www.upcbarcodes.com/wp-admin/admin-ajax.php'
    # print input_seller_name
    for seller_url in urls:
        if seller_url not in url_done:
            requsturl = scrapy.FormRequest(
                'https://www.amazon.com/gp/delivery/ajax/address-change.html',
                headers={
                    'Origin': 'https://www.amazon.com',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'Accept': 'text/html,*/*',
                    'Referer': 'https://www.amazon.com/',
                    'X-Requested-With': 'XMLHttpRequest',
                    'Connection': 'keep-alive'
                },
                formdata={
                    'zipCode': '60629',
                    'locationType': 'LOCATION_INPUT',
                    'deviceType': 'web',
                    'pageType': 'Detail'
                },
                callback=self.getupc,
                method="POST")
            # print requsturl.body
            requsturl.cookies = {
                'aws-target-static-id': '1452239187641-159842',
                'aws-business-metrics-last-visit': '1460349972900',
                '__utmv': '194891197.%22QDSe8l%404pyTIl%3FpKm5C24aEFXeBtLGw3BhDIGikRUeXlFGLshyp4Dtw4gLRG%3F9cU%22',
                'aws-userInfo': '%7B%22arn%22%3A%22arn%3Aaws%3Aiam%3A%3A111320495319%3Aroot%22%2C%22alias%22%3A%22%22%2C%22username%22%3A%22rahul%2520bhaskar%22%2C%22keybase%22%3A%22%22%2C%22issuer%22%3A%22https%3A%2F%2Fwww.amazon.com%2Fap%2Fsignin%22%7D',
                's_pers': '%20s_vnum%3D1880352564702%2526vn%253D3%7C1880352564702%3B%20s_invisit%3Dtrue%7C1470309348410%3B%20s_nr%3D1470307548416-Repeat%7C1478083548416%3B',
                '__utma': '194891197.372951375.1452236845.1470290622.1470307550.17',
                'aws-ubid-main': '182-7331780-4611541',
                '_mkto_trk': 'id:112-TZM-766&token:_mch-aws.amazon.com-1484112318467-15791',
                'x-amz-captcha-1': '1494506850389641',
                'x-amz-captcha-2': '7TNj5/ZBUiQE8Q7M1TGIGw==',
                'aws-target-visitor-id': '1452239187643-451048.20_19',
                'aws-target-data': '%7B%22support%22%3A%221%22%7D',
                's_fid': '70673D38D5DFE123-1B689FC000FE2EFF',
                's_vn': '1515648318007%26vn%3D7',
                'regStatus': 'registered',
                'x-wl-uid': '1YbHUe7z4Q16UzYOKnza7nF0Z8c60AUse7MqEp+CAv+wdJamSRB88EpQjCOb5Xsg9wS/EFz0+hhSbAl3qbMeh7dWiD1jtJRDs/6R5VxAFk6LzV16+6hZ0Cz+uIpt9TzsXS7IGe2aDx3Q=',
                'sst-main': 'Sst1|PQGX_RwjQAxLFI_BwdTV0Q4UCL8-RIlysfyKrjYoFGe3oqm9lnuttlbX-lGX4weSExupeA7cYB3Zb0CSGU91LcK9xa8Av4IeMWfbcMKAV4AXqvCSM7S-SXJJpEWQhn0AsaJNc4wwxVVQzrZRhD4jVmdocyJewDAfSRGF1SSTgg_cvNGYGZx8-WqW1z-bekrkDEc-ZrMz9f9Ii077rpcz7Q0tBrE5xr2htKXdWZUzmT4ZSBqkJ9NlatkaEU7sYxBuyl0LadTT6wmYRPPfHnJzSQYdUQ',
                'ubid-main': '156-9680828-0484351',
                'ca': 'ALAAAAAAEAAAAAQGAAEIAUQ=',
                's_vnum': '1926421318258%26vn%3D2',
                's_nr': '1514281824850-New',
                's_dslv': '1514281824851',
                'session-id': '144-3935774-8062208',
                'session-token': '"n2d7o5bJUB480T+okCcD+Qgte5eb6+XVoWrh4WzA8cPLcyI8v4G8hDqqoR2uWyzLBg4ETAaFwIQ6lGxkm9Hx8EmSQMVq4In0q2pXM0KD/1jNBUtqnPJf5WRZb/xRJGL2mIv58UxYLpLX0e1wf6XYjtrHfPcAOONchcbeZIpAXZOil1fCyrFDBgE3AmUSvlFNadxFHlRhG6IUrSJ/W7TAEw=="',
                'session-id-time': '2082787201l',
                'csm-hit': '%7B%22tb%22%3A%227099AAE54J4R4DQXDH89%2Bs-7099AAE54J4R4DQXDH89%7C1515471521353%22%2C%22adb%22%3A%22adblk_no%22%7D'
            }
            time.sleep(1)
            yield requsturl
def parse_company_list(self, response):
    for company_item in response.xpath('//div[@class="jie_nei"]/ul/li/a'):
        info = json.loads(
            company_item.xpath('@onclick').re_first(r'company0(.*?);')
            .replace('(', '[').replace(')', ']').replace('\'', '\"'))
        company_code, info_no, zj = info
        column_id = response.meta['columnid']
        company = ExposureCompanyItem.get_company(column_id, company_code, info_no)
        # the current and historical lists of cooperating intermediaries and
        # cooperating third parties all share the same payload and callback
        endpoints = [
            'http://icid.iachina.cn/ICID/front/viewAllZJ.do',         # cooperating intermediaries
            'http://icid.iachina.cn/ICID/front/viewAllZJHis.do',      # cooperating intermediaries, history
            'http://icid.iachina.cn/ICID/front/viewAllSecond.do',     # cooperating third parties
            'http://icid.iachina.cn/ICID/front/viewAllSecondHis.do',  # cooperating third parties, history
        ]
        for endpoint in endpoints:
            yield scrapy.FormRequest(
                url=endpoint,
                method='POST',
                formdata={
                    'columnid': column_id,
                    'internetInformationNo': info_no,
                    'informationno': info_no,
                    'zj': zj
                },
                meta={'company': company},
                callback=self.parse_cooperation_list,
                dont_filter=True)
def start_requests(self):
    url = 'http://www.sxt.cn/index/login/login.html'
    formdata = {"user": "******", "password": "******"}
    yield scrapy.FormRequest(url, formdata=formdata, callback=self.parse)
def parse(self, response):
    lastpage = int(response.xpath("//select[@id='Page']/option[last()]/@value")[0].extract())
    for page in xrange(1, lastpage + 1):
        yield scrapy.FormRequest(response.url, callback=self.parse_songs, formdata={'Page': str(page)})
def person_info(self, response):
    now_person = json.loads(response.text)
    for n in now_person['rows']:
        person_info = {
            'companyName': response.meta['company_name'],
            'licenseNum': response.meta['number'],
            'area': '江西省',
            'sex': '',
            'idCard': '',
            'major': '',
            'phone': '',
            'tokenKey': self.token,
            'name': n['name']
        }
        # person name
        # print(n['name'])
        print('I am %s ---- company is %s' % (n['name'], response.meta['company_name']))
        # certificate number
        try:
            person_info['num'] = n['registrationInfo'][0]['regCertificateNumber']
        except KeyError:
            person_info['num'] = ''
        # registration category
        person_info['grade'] = n['registrationInfo'][0]['registerType']['name']
        # registered major
        try:
            person_info['major'] = n['registrationInfo'][0]['qualificationRegMajors'][0]['name']
        except KeyError:
            person_info['major'] = ''
        # practising seal number
        print(n['registrationInfo'][0]['qualificationCertNumber'])
        person_info['regNum'] = n['registrationInfo'][0]['qualificationCertNumber']
        # issuing authority -- to be continued
        # certificate validity date
        try:
            print(n['registrationInfo'][0]['registrationDt'])
            c = time.localtime(int(n['registrationInfo'][0]['registrationDt'] / 1000))
            person_info['validTime'] = str(time.strftime("%Y-%m-%d", c))
        except KeyError:
            person_info['validTime'] = ''
        # print(person_info)
        print('registered personnel info %s' % person_info)
        yield scrapy.FormRequest(
            url='https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
            formdata=person_info,
            callback=self.person_zz,
            meta={'company_name': response.meta['company_name']},
            dont_filter=True)