def test_splash_form_request():
    """Verify SplashFormRequest form handling for both POST and GET.

    POST: formdata goes into the body and the target URL lands in the
    splash args. GET: formdata is appended to the URL as a query string,
    the body stays empty, and a custom endpoint is honored.
    """
    post_req = SplashFormRequest('http://example.com', formdata={'foo': 'bar'})
    assert post_req.method == 'POST'
    assert post_req.body == b'foo=bar'
    assert post_req.meta['splash']['args']['url'] == 'http://example.com'

    get_req = SplashFormRequest('http://example.com', method='GET',
                                formdata={'foo': 'bar'}, endpoint='execute')
    assert get_req.method == 'GET'
    assert get_req.body == b''
    assert get_req.url == get_req.meta['splash']['args']['url'] == \
        'http://example.com?foo=bar'
    assert get_req.meta['splash']['endpoint'] == 'execute'
def parse_field(self, response):
    """Yield one fieldItem per <li> node, then a follow-up request per field.

    The degree id arrives via response.meta; each field's id feeds the
    'subCategoryMl' form request that drills down into subjects.
    """
    degree_id = response.meta['degreeId']
    nodes = Selector(text=response.body).xpath('//li')
    if not nodes:
        return
    for node in nodes:
        item = fieldItem()
        item['id'] = node.xpath('@id').extract()[0]
        # strip the private-use glyph the site embeds in names
        item['name'] = re.sub('\ue6a2', '', node.xpath('./text()').extract()[0])
        item['degreeId'] = degree_id
        yield item
    for node in nodes:
        field_id = node.xpath('@id').extract()[0]
        yield SplashFormRequest(self.base_major_url,
                                formdata={'method': 'subCategoryMl',
                                          'key': field_id},
                                callback=self.parse_subject,
                                meta={'fieldId': field_id})
def parse_degree(self, response):
    """Yield a degreeItem per degree <li>, then request each degree's fields."""
    soup = BeautifulSoup(response.body, 'lxml')
    degree_nodes = (soup.find('div', attrs={'class': 'zyk-list'})
                    .find('ul', attrs={'class': 'zyk-cc-ul'})
                    .findAll('li'))
    for node in degree_nodes:
        item = degreeItem()
        item['id'] = node.attrs['id']
        # strip the private-use glyph the site embeds in names
        item['name'] = re.sub('\ue6a2', '', node.text)
        yield item
    for node in degree_nodes:
        degree_id = node.attrs['id']
        yield SplashFormRequest(self.base_major_url,
                                formdata={'method': 'subCategoryMl',
                                          'key': degree_id},
                                callback=self.parse_field,
                                meta={'degreeId': degree_id})
def login_me(self, response): RequestVerificationToken = response.selector.xpath( "//form[@id='login-form']//input[@name='__RequestVerificationToken']/@value" ).get("") # get the Captcha's options sitekey = response.selector.xpath( "//div[@class='g-recaptcha']/@data-sitekey").get("") gcaptcha_txt = self.solve_captcha(sitekey, response.url) if not gcaptcha_txt: return frm_data = { 'Empresa': '1', 'Email': self.e_mail, 'Senha': self.senha, 'g-recaptcha-response': gcaptcha_txt, '__RequestVerificationToken': RequestVerificationToken } print(frm_data) yield SplashFormRequest(self.start_url, formdata=frm_data, callback=self.get_main_page, errback=self.errback_func, endpoint='execute', cache_args=['lua_source'], args={'lua_source': script_10_sec_wait}, dont_filter=True)
def parse_subject(self, response):
    """Yield one subjectItem per <li>, then request each subject's majors."""
    field_id = response.meta['fieldId']
    nodes = Selector(text=response.body).xpath('//li')
    if not nodes:
        return
    for node in nodes:
        item = subjectItem()
        item['id'] = node.xpath('@id').extract()[0]
        # strip the private-use glyph the site embeds in names
        item['name'] = re.sub('\ue6a2', '',
                              node.xpath('./text()').extract()[0])
        item['fieldId'] = field_id
        yield item
    for node in nodes:
        subject_id = node.xpath('@id').extract()[0]
        yield SplashFormRequest(self.base_major_url,
                                formdata={'method': 'subCategoryXk',
                                          'key': subject_id},
                                callback=self.parse_major,
                                meta={'subjectId': subject_id})
def login_me(self, response):
    """Echo back the Detran-SP login form, overriding selected fields.

    Walks every <input> inside the form owning the 'Entrar' button, keeps the
    page-provided values, and substitutes the redirect URL, modal message,
    CPF/CNPJ and password. The submit URL comes from the JSF
    'javax.faces.encodedURL' hidden field.
    """
    form_inputs = response.selector.xpath(
        "//form[.//input[@value='Entrar']]//input")
    frm_data = {}
    for inpt in form_inputs:
        inpt_name = inpt.xpath("./@name").get("")
        inpt_val = inpt.xpath("./@value").get("")
        # Override specific fields with our own values; everything else is
        # passed through unchanged so the JSF view state stays valid.
        if "urlRedirectLogin" in inpt_name:
            inpt_val = "/wps/portal/portaldetran/cidadao/infracoes/servicos/consultaMultas"
        elif "modalMensagem" in inpt_name:
            inpt_val = "Para realizar a pesquisa de débitos e restrições de veículos do proprietário,<br /> responda algumas perguntas, acesse com seu CPF e senha ou cadastre-se abaixo:"
        elif "numeroLogin" in inpt_name:
            inpt_val = self.cpf_cnpj
        elif "senhaLogin" in inpt_name:
            inpt_val = self.senha
        frm_data.update({inpt_name: inpt_val})
    url = "http://www.detran.sp.gov.br" + frm_data['javax.faces.encodedURL']
    # yield FormRequest(url, formdata=frm_data, callback=self.set_renavam, dont_filter=True)
    yield SplashFormRequest(url,
                            formdata=frm_data,
                            callback=self.set_renavam,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script_10_sec_wait},
                            dont_filter=True)
def start_requests(self):
    """Build the initial requests.

    Chooses between plain Scrapy and Splash-rendered requests depending on
    self.dynamic, and between GET and POST depending on self.method.
    """
    for url in self.start_urls:
        is_post = self.method.lower() == 'post'
        if self.dynamic:
            if is_post:
                yield SplashFormRequest(url=url,
                                        formdata=self.form,
                                        callback=self.parse_first,
                                        dont_filter=False,
                                        args=self.args_data)
            else:
                yield SplashRequest(url,
                                    callback=self.parse_first,
                                    endpoint='execute',
                                    args=self.args_data)
        elif is_post:
            yield FormRequest(url=url,
                              formdata=self.form,
                              headers=self.headers,
                              cookies=self.cookies,
                              callback=self.parse_first,
                              dont_filter=True)
        else:
            yield Request(url=url,
                          headers=self.headers,
                          cookies=self.cookies,
                          callback=self.parse_first)
def make_request_from_data(self, data):
    """Turn a payload read from the redis key into a Scrapy request.

    :param data: raw bytes pulled from redis (JSON-encoded ScheduledRequest)
    :return: Request/FormRequest, or their Splash equivalents when the
        scheduled meta carries a 'splash' section
    :raises OSError: when the scheduled callback name resolves to nothing callable
    """
    scheduled = ScheduledRequest(
        **json.loads(bytes_to_str(data, self.redis_encoding)))
    callback, dont_filter = self.get_callback(scheduled.callback)
    if not callable(callback):
        raise OSError(f"{scheduled.callback}没有指定回调函数")
    params = {
        'url': scheduled.url,
        'method': scheduled.method,
        'meta': scheduled.meta,
        'dont_filter': dont_filter,
        'callback': callback
    }
    is_post = scheduled.method == "POST"
    if 'splash' in scheduled.meta:
        splash_opts = scheduled.meta.get('splash')
        params['args'] = {
            'wait': splash_opts.get('wait', 2),
            # images defaults to 0: skip image downloads
            'images': splash_opts.get('images', 0),
        }
        if is_post:
            return SplashFormRequest(formdata=scheduled.body, **params)
        return SplashRequest(**params)
    if is_post:
        return FormRequest(formdata=scheduled.body, **params)
    return Request(**params)
def start_requests(self):
    """Kick off the crawl by posting the sign-in form through Splash."""
    credentials = {"email": self.e_mail, "password": self.senha}
    yield SplashFormRequest(
        self.start_url + '/login/sign_in',
        formdata=credentials,
        callback=self.sign_in_me,
        errback=self.errback_func,
        endpoint='execute',
        cache_args=['lua_source'],
        args={'lua_source': script_10_sec_wait},
        dont_filter=True)
def parse(self, response):
    """Demonstrate SplashFormRequest and the supported meta['splash'] options."""
    # meta['splash']['args'] holds the arguments sent to Splash.
    # meta['splash']['endpoint'] selects the Splash endpoint; default is render.html.
    # meta['splash']['splash_url'] overrides the Splash URL configured in settings.py.
    # meta['splash']['splash_headers'] lets you add or modify HTTP headers sent to
    #     the Splash server — note this does NOT change headers sent to the remote
    #     web site.
    # meta['splash']['dont_send_headers'] set True to avoid passing headers to Splash.
    # meta['splash']['slot_policy'] customizes how Splash requests are synchronized.
    # meta['splash']['dont_process_response'] when True, SplashMiddleware will not
    #     wrap the default scrapy.Response; by default a SplashResponse subclass
    #     such as SplashTextResponse is returned.
    # meta['splash']['magic_response'] defaults to True: Splash then auto-fills
    #     Response attributes such as response.headers and response.body.
    # SplashFormRequest usage
    yield SplashFormRequest(response.url, self.next_parse, formdata={'name': '111'})
def set_renavam(self, response):
    """Fill the Renavam form on the Detran-SP page and submit it via Splash.

    Records a WRONG_CREDENTIALS error and stops when the page shows an
    alert-error message; stops silently when the captcha cannot be solved.
    The clicked button is expressed through the JSF '<form>:_idcl'
    convention, and the Splash session cookies are forwarded.
    """
    error_message = response.selector.xpath(
        "//ul[contains(@class,'alert-error') and not(@style)]/li/span/text()"
    ).get("")
    if error_message:
        error_msg = {
            "error_type": "WRONG_CREDENTIALS",
            "details": error_message
        }
        self.errors.append(error_msg)
        self.logger.warning(error_msg)
        return
    # get the Captcha's options
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not gcaptcha_txt:
        return
    renavam_form = response.selector.xpath(
        "//form[.//td[contains(text(),'Renavam')]]")
    form_name = renavam_form.xpath("./@name").get("")
    frm_data = {
        # JSF encodes the clicked component as '<formName>:_idcl'
        "{}:_idcl".format(form_name): form_name.replace("form", "btAvancar"),
        'g-recaptcha-response': gcaptcha_txt
    }
    form_inputs = renavam_form.xpath(".//input")
    for inpt in form_inputs:
        inpt_name = inpt.xpath("./@name").get("")
        inpt_val = inpt.xpath("./@value").get("")
        if ":Renavam" in inpt_name:
            inpt_val = self.renavam
        frm_data.update({inpt_name: inpt_val})
    url = "http://www.detran.sp.gov.br" + renavam_form.xpath(
        "./@action").get("")
    yield SplashFormRequest(url,
                            formdata=frm_data,
                            callback=self.get_main_page,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={
                                'lua_source': script_10_sec_wait,
                                'cookies': response.data['cookies']
                            },
                            dont_filter=True)
def parse_affiliates(self, response):
    """Submit the country filter for each country, then scrape the affiliate table.

    Fixes over the original: the stray bare `countries` expression statement
    (a no-op at best) is removed, and each table row is read through the row
    selector instead of the leftover loop string `country`, which has no
    `.xpath` attribute and raised AttributeError.
    """
    for country in countries:
        yield SplashFormRequest(
            url='https://www.crossfit.com/affiliate-list',
            formxpath="//div[@class='form-group']/select[@id='countryFilter']",
            formdata={'option': country})
    rows = response.xpath("//table[@id='affiliateTable']/tbody/tr")
    for row in rows:
        yield {
            'gym name': row.xpath('.//td/a/text()').get(),
            'local': row.xpath('.//td/text()').get(),
            # NOTE(review): `country` is the last value of the loop above, as in
            # the original code — the intended per-row country likely needs to
            # come from response.meta or the page itself; confirm.
            'country': country,
        }
def login_me(self, response):
    """Post the Renavam lookup to the SEFAZ-BA IPVA service via Splash."""
    lookup_url = ('http://www.sefaz.ba.gov.br/scripts/ipva/dae/'
                  'VeiculoCadastrado/ipva_texto_obter_desconto200.asp')
    payload = {'txt_renavam': self.renavam, 'txt_renavam1': ''}
    yield SplashFormRequest(
        lookup_url,
        formdata=payload,
        callback=self.get_main_page,
        errback=self.errback_func,
        endpoint='execute',
        cache_args=['lua_source'],
        args={'lua_source': script,
              'cookies': response.data['cookies']},
        dont_filter=True)
def get_debito_calculado_ipva(self, response):
    """Scrape the computed IPVA debit values from the result page.

    When a single-quota value is present: if file capture is enabled, posts
    the 'dae avulso' form (CNPJ/CPF parts deliberately empty) so the payment
    slip can be rendered to PDF, carrying the scraped row along in meta;
    otherwise appends the row to self.result['ipva_do_veiculo'].
    """
    valor_do_ipva = response.selector.xpath(
        "//font[contains(.,'Pagamento de cota')]/text()").get("").strip()
    ano_exercicio = response.selector.xpath(
        "//span[contains(.,'Ano Exercício')]/../input/@value").get(
            "").strip()
    data_do_vencimento = response.selector.xpath(
        "//span[contains(.,'Data do Vencimento')]/../input/@value").get(
            "").strip()
    valor_da_cota_unica = response.selector.xpath(
        "//span[contains(.,'Valor da Cota única')]/../input/@value").get(
            "").strip()
    print(valor_da_cota_unica)
    if valor_da_cota_unica:
        row_data = {
            'valor_do_ipva': valor_do_ipva,
            'ano_exercicio': ano_exercicio,
            'data_do_vencimento': data_do_vencimento,
            'valor_da_cota_unica': valor_da_cota_unica
        }
        if self.get_files:
            url = "http://www.sefaz.ba.gov.br/scripts/ipva/dae/VeiculoCadastrado/result_dae_avulso_ipva.asp"
            frm_data = {
                'Lnum_cnpj_cpf_base': '',
                'Lnum_cnpj_cpf_filial': '',
                'Lnum_cnpj_cpf_digito': ''
            }
            #yield FormRequest(url, callback=self.test_file,
            #                  errback=self.errback_func, dont_filter=True)
            yield SplashFormRequest(
                url,
                formdata=frm_data,
                callback=self.print_html_to_pdf,
                #errback=self.errback_func,
                endpoint='execute',
                cache_args=['lua_source'],
                args={
                    'lua_source': script,
                    'cookies': response.data['cookies']
                },
                meta={'row_data': row_data},
                dont_filter=True)
        else:
            ipva_do_veiculo = self.result.get('ipva_do_veiculo', [])
            ipva_do_veiculo.append(row_data)
            self.result.update({'ipva_do_veiculo': ipva_do_veiculo})
def get_login_page(self, response):
    """Function to get request options to login.
    Used to get ReCaptcha token; image captcha value."""
    # get the Captcha's options
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not gcaptcha_txt:
        return
    # Collect the ASP.NET hidden state fields straight off the page.
    frm_data = {
        name: response.selector.xpath(
            "//input[@id='{}']/@value".format(name)).get("")
        for name in ('__EVENTTARGET', '__EVENTARGUMENT', '__VIEWSTATE',
                     '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
    }
    frm_data.update({
        'ctl00$conteudoPaginaPlaceHolder$txtRenavam': self.renavam,
        'ctl00$conteudoPaginaPlaceHolder$txtPlaca': self.placa,
        'g-recaptcha-response': gcaptcha_txt,
        'ctl00$conteudoPaginaPlaceHolder$btn_Consultar': 'Consultar'
    })
    yield SplashFormRequest(self.start_url,
                            formdata=frm_data,
                            callback=self.login_me,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={
                                'lua_source': script_10_sec_wait,
                                'cookies': response.data['cookies']
                            },
                            dont_filter=True)
def start_requests(self):
    """Query CNINFO's announcement API for each ticker's annual reports.

    The original built two payload dicts that differed only in the
    'column' field; the common payload is now built once and the exchange
    column is selected from the ticker's first digit
    ('0' -> Shenzhen main board, '6' -> Shanghai).
    """
    tickers = [
        '600001',
        '000777',
    ]
    # ticker prefix -> exchange column expected by the CNINFO endpoint
    columns = {'0': 'szse_main', '6': 'sse'}
    for ticker in tickers:  # try all tickers one by one
        column = columns.get(ticker[0])
        if column is None:
            print("Wrong ticker")
            continue
        data = {  # prepare for the request
            'stock': ticker,
            'searchkey': '年年度报告',
            'category': 'category_ndbg_szsh',
            'pageNum': '1',
            'pageSize': '30',
            'column': column,
            'tabName': 'fulltext',
            'sortName': '',
            'sortType': '',
            'limit': '',
            'seDate': '',
        }
        yield SplashFormRequest(
            url='http://www.cninfo.com.cn/cninfo-new/announcement/query',
            formdata=data,
            callback=self.parse,
            # args={'wait': 2}
        )
def get_result_debito_ipva(self, response):
    """Request the IPVA debit calculation page, dated with today's payment date."""
    cota3 = response.url.split("cota3=")[-1]
    calc_url = ("http://www.sefaz.ba.gov.br/scripts/ipva/dae/"
                "VeiculoCadastrado/debito_calculado_ipva.asp"
                "?cota3={}").format(cota3)
    payload = {'txt_dtc_pagamento': dt.now().strftime("%d/%m/%Y")}
    yield SplashFormRequest(
        calc_url,
        formdata=payload,
        callback=self.get_debito_calculado_ipva,
        errback=self.errback_func,
        endpoint='execute',
        cache_args=['lua_source'],
        args={'lua_source': script,
              'cookies': response.data['cookies']},
        dont_filter=True)
def login_me(self, response):
    """Submit the netcombo auth form with OAuth fields lifted off the page."""
    login_url = "https://auth.netcombo.com.br/login"
    # Read the OAuth parameters straight from the form's hidden inputs.
    hidden = {
        name: response.selector.xpath(
            "//input[@name='{}']/@value".format(name)).get("")
        for name in ('client_id', 'redirect_uri', 'response_type',
                     'scope', 'state', 'authMs')
    }
    frm_data = {
        'Username': self.login,
        'password': self.senha,
        'client_id': hidden['client_id'],
        'redirect_uri': hidden['redirect_uri'],
        'response_type': hidden['response_type'],
        'scope': hidden['scope'],
        'state': hidden['state'],
        'authMs': hidden['authMs'],
        'Auth_method': 'UP'
    }
    print(frm_data)
    yield SplashFormRequest(login_url,
                            formdata=frm_data,
                            callback=self.select_contract,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={
                                'lua_source': script_30_sec_wait,
                                'cookies': response.data['cookies'],
                                'timeout': 60,
                                'images': 0
                            },
                            dont_filter=True)
def get_ipva_search(self, response):
    """Solve ReCaptcha v3 and post the Renavam search to the SEFIN-RO portal."""
    selector = response.selector
    action = selector.xpath("//input[@id='action']/@value").get("")
    csrf_token = selector.xpath("//input[@id='csrf_token']/@value").get("")
    # the sitekey is embedded in the recaptcha script URL after 'render='
    sitekey = selector.xpath(
        "//script[contains(@src,'recaptcha/api')]/@src").get(
            "").split("render=")[-1]
    gcaptcha_txt = self.solve_captcha(sitekey,
                                      response.request.url,
                                      captcha_type=5,
                                      captcha_action='portal_consulta_renavam')
    if not gcaptcha_txt:
        return
    renavam = response.meta['renavam']
    frm_data = {
        'action': action,
        'renavam': renavam,
        'csrf_token': csrf_token,
        'recaptcha_response': gcaptcha_txt
    }
    ipva_url = "https://portalcontribuinte.sefin.ro.gov.br/Publico/__Resultado_Renavam_.jsp"
    yield SplashFormRequest(ipva_url,
                            formdata=frm_data,
                            endpoint='render.json',
                            args=self.splash_args,
                            meta={'renavam': renavam},
                            callback=self.get_ipva_result,
                            dont_filter=True)
def login_me(self, response):
    """Function to get request options to login.
    Used to get ReCaptcha token; image captcha value."""
    # get the Captcha's options
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not gcaptcha_txt:
        return
    payload = {
        'Renavam': self.renavam,
        'Placa': self.placa,
        'g-recaptcha-response': gcaptcha_txt
    }
    yield SplashFormRequest(self.start_url,
                            formdata=payload,
                            callback=self.get_main_page,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': autos_detran_ro_script},
                            dont_filter=True)
def search_result(self, response):
    """Two-phase handler for the Brisbane PDOnline property search.

    Phase 1 — terms dialog present ('#ctl00_RadWindow1_C_btnOk' matches):
    re-post every form input except the 'I Disagree' button to accept the
    terms, then re-enter this same callback.

    Phase 2 — real search page: capture Splash session cookies, rebuild the
    full ASP.NET form state from the page's inputs, overlay the caller's
    criteria from response.meta['input_data'] (suburb, street, unit/street
    number ranges, plan, lot), mirroring each Telerik control with its
    '*_ClientState' JSON blob, then submit the search both as a plain
    FormRequest and as a Splash request.
    """
    if response.css("#ctl00_RadWindow1_C_btnOk"):
        # Terms dialog: echo every named input back; skip the
        # 'I Disagree' button so the terms are accepted.
        form_data = dict()
        input_fields = response.css("form input")
        for ifield in input_fields:
            if ifield.css("input::attr(name)").extract_first() == None:
                continue
            if ifield.css("input::attr(value)").extract_first() == None:
                form_data[ifield.css(
                    "input::attr(name)").extract_first()] = ''
            elif ifield.css(
                    "input::attr(value)").extract_first() == 'I Disagree':
                continue
            elif ifield.css("input::attr(value)").extract_first() == '':
                form_data[ifield.css(
                    "input::attr(name)").extract_first()] = ''
            else:
                form_data[ifield.css("input::attr(name)").extract_first(
                )] = ifield.css("input::attr(value)").extract_first()
        yield SplashFormRequest(
            url=
            'https://pdonline.brisbane.qld.gov.au/MasterPlan/Modules/Enquirer/PropertySearch.aspx',
            formdata=form_data,
            callback=self.search_result,
            endpoint='execute',
            args={
                'timeout': '90',
                'wait': '30',
                'lua_source': script,
            },
            session_id='1',
            meta={
                'input_data': response.meta["input_data"],
            })
    else:
        # Cookie handling
        for cookie in response.data["cookies"]:
            self.dynamic_cookie[cookie["name"]] = cookie["value"]
        # Rebuild the complete form state from the page.
        form_data = dict()
        input_fields = response.css("form input")
        for ifield in input_fields:
            if ifield.css("input::attr(name)").extract_first() == None:
                continue
            if ifield.css("input::attr(value)").extract_first() == None:
                form_data[ifield.css(
                    "input::attr(name)").extract_first()] = ''
            else:
                form_data[ifield.css("input::attr(name)").extract_first(
                )] = ifield.css("input::attr(value)").extract_first()
        # Overlay each user-supplied criterion onto its form control; Telerik
        # inputs also need a parallel '*_ClientState' blob mirroring the value.
        for key in response.meta["input_data"]:
            # if response.meta["input_data"][key]:
            if key == "Suburb":
                form_data["ctl00$MainContent$SuburbCombo"] = response.meta[
                    "input_data"][key]
            elif key == "Street Name":
                street_name_define = {
                    "logEntries": [],
                    "value": "",
                    "text": "",
                    "enabled": True,
                    "checkedIndices": [],
                    "checkedItemsTextOverflows": False
                }
                street_name_define["value"] = response.meta["input_data"][
                    key]
                street_name_define["text"] = response.meta["input_data"][
                    key]
                form_data[
                    "ctl00_MainContent_StreetCombo_ClientState"] = street_name_define
                form_data["ctl00$MainContent$StreetCombo"] = response.meta[
                    "input_data"][key]
            elif key == "Unit Number From":
                form_data[
                    "ctl00$MainContent$FromUnitNumberTextBox"] = response.meta[
                        "input_data"][key]
                ctl00_MainContent_FromUnitNumberTextBox_ClientState = {
                    "enabled": True,
                    "emptyMessage": "",
                    "validationText": "",
                    "valueAsString": "",
                    "lastSetTextBoxValue": ""
                }
                ctl00_MainContent_FromUnitNumberTextBox_ClientState[
                    "validationText"] = response.meta["input_data"][key]
                ctl00_MainContent_FromUnitNumberTextBox_ClientState[
                    "valueAsString"] = response.meta["input_data"][key]
                ctl00_MainContent_FromUnitNumberTextBox_ClientState[
                    "lastSetTextBoxValue"] = response.meta["input_data"][
                        key]
                form_data[
                    "ctl00_MainContent_FromUnitNumberTextBox_ClientState"] = ctl00_MainContent_FromUnitNumberTextBox_ClientState
            elif key == "Unit Number To":
                form_data[
                    "ctl00$MainContent$ToUnitNumberTextBox"] = response.meta[
                        "input_data"][key]
                ctl00_MainContent_ToUnitNumberTextBox_ClientState = {
                    "enabled": True,
                    "emptyMessage": "",
                    "validationText": "",
                    "valueAsString": "",
                    "lastSetTextBoxValue": ""
                }
                ctl00_MainContent_ToUnitNumberTextBox_ClientState[
                    "validationText"] = response.meta["input_data"][key]
                ctl00_MainContent_ToUnitNumberTextBox_ClientState[
                    "valueAsString"] = response.meta["input_data"][key]
                ctl00_MainContent_ToUnitNumberTextBox_ClientState[
                    "lastSetTextBoxValue"] = response.meta["input_data"][
                        key]
                form_data[
                    "ctl00_MainContent_ToUnitNumberTextBox_ClientState"] = ctl00_MainContent_ToUnitNumberTextBox_ClientState
            elif key == "Street Number From":
                form_data[
                    "ctl00$MainContent$FromStreetNumberTextBox"] = response.meta[
                        "input_data"][key]
                ctl00_MainContent_FromStreetNumberTextBox_ClientState = {
                    "enabled": True,
                    "emptyMessage": "",
                    "validationText": "",
                    "valueAsString": "",
                    "lastSetTextBoxValue": ""
                }
                ctl00_MainContent_FromStreetNumberTextBox_ClientState[
                    "validationText"] = response.meta["input_data"][key]
                ctl00_MainContent_FromStreetNumberTextBox_ClientState[
                    "valueAsString"] = response.meta["input_data"][key]
                ctl00_MainContent_FromStreetNumberTextBox_ClientState[
                    "lastSetTextBoxValue"] = response.meta["input_data"][
                        key]
                form_data[
                    "ctl00_MainContent_FromStreetNumberTextBox_ClientState"] = ctl00_MainContent_FromStreetNumberTextBox_ClientState
            elif key == "Street Number To":
                form_data[
                    "ctl00$MainContent$ToStreetNumberTextBox"] = response.meta[
                        "input_data"][key]
                ctl00_MainContent_ToStreetNumberTextBox_ClientState = {
                    "enabled": True,
                    "emptyMessage": "",
                    "validationText": "",
                    "valueAsString": "",
                    "lastSetTextBoxValue": ""
                }
                ctl00_MainContent_ToStreetNumberTextBox_ClientState[
                    "validationText"] = response.meta["input_data"][key]
                ctl00_MainContent_ToStreetNumberTextBox_ClientState[
                    "valueAsString"] = response.meta["input_data"][key]
                ctl00_MainContent_ToStreetNumberTextBox_ClientState[
                    "lastSetTextBoxValue"] = response.meta["input_data"][
                        key]
                form_data[
                    "ctl00_MainContent_ToStreetNumberTextBox_ClientState"] = ctl00_MainContent_ToStreetNumberTextBox_ClientState
            elif key == "Plan Number":
                form_data["ctl00$MainContent$PlanTextBox"] = response.meta[
                    "input_data"][key]
                ctl00_MainContent_PlanTextBox_ClientState = {
                    "enabled": True,
                    "emptyMessage": "",
                    "validationText": "",
                    "valueAsString": "",
                    "lastSetTextBoxValue": ""
                }
                ctl00_MainContent_PlanTextBox_ClientState[
                    "validationText"] = response.meta["input_data"][key]
                ctl00_MainContent_PlanTextBox_ClientState[
                    "valueAsString"] = response.meta["input_data"][key]
                ctl00_MainContent_PlanTextBox_ClientState[
                    "lastSetTextBoxValue"] = response.meta["input_data"][
                        key]
                # NOTE(review): unlike every other field, the Plan ClientState
                # blob is never stored into form_data — looks like an
                # omission; confirm against the live form.
            elif key == "Lot Number":
                form_data["ctl00$MainContent$LotTextBox"] = response.meta[
                    "input_data"][key]
                ctl00_MainContent_LotTextBox_ClientState = {
                    "enabled": True,
                    "emptyMessage": "",
                    "validationText": "",
                    "valueAsString": "",
                    "lastSetTextBoxValue": ""
                }
                ctl00_MainContent_LotTextBox_ClientState[
                    "validationText"] = response.meta["input_data"][key]
                ctl00_MainContent_LotTextBox_ClientState[
                    "valueAsString"] = response.meta["input_data"][key]
                ctl00_MainContent_LotTextBox_ClientState[
                    "lastSetTextBoxValue"] = response.meta["input_data"][
                        key]
                form_data[
                    "ctl00_MainContent_LotTextBox_ClientState"] = ctl00_MainContent_LotTextBox_ClientState
            else:
                pass
        # Serialize dict-valued ClientState blobs and stringify ints so the
        # form encoder accepts every value.
        for key in form_data:
            if isinstance(form_data[key], dict):
                form_data[key] = json.dumps(form_data[key])
            if isinstance(form_data[key], int):
                form_data[key] = str(form_data[key])
        # first method that i used to submit form
        yield FormRequest(
            url=
            'https://pdonline.brisbane.qld.gov.au/MasterPlan/Modules/Enquirer/PropertySearch.aspx',
            formdata=form_data,
            callback=self.submit_form,
            cookies=self.dynamic_cookie)
        # Second method that i have used to submit form
        yield SplashFormRequest(
            url=
            'https://pdonline.brisbane.qld.gov.au/MasterPlan/Modules/Enquirer/PropertySearch.aspx',
            formdata=form_data,
            callback=self.submit_form,
            endpoint='execute',
            args={
                'timeout': '90',
                'wait': '30',
                'lua_source': script,
            },
            session_id='1')
def login_me(self, response):
    """Parse the IPVANET (SP) consulta result page after login.

    Handles wrong-credential and rejected-captcha errors (re-queuing the
    login page while retries remain), optionally captures a screenshot,
    scrapes the vehicle summary fields and every detail table (IPVA,
    payment modalities, history tables, taxas) into self.result, and —
    when a 'Multas' button is present — posts the ASP.NET form to fetch
    the fines table.
    """
    renavam = response.selector.xpath(
        "//span[@id='conteudoPaginaPlaceHolder_txtRenavam']/text()").get(
            "").strip()
    placa = response.selector.xpath(
        "//span[@id='conteudoPaginaPlaceHolder_txtPlaca']/text()").get(
            "").strip()
    print("renavam:", renavam)
    print("placa:", placa)
    error_message = response.selector.xpath(
        "//span[@id='conteudoPaginaPlaceHolder_lblErro']/text()").get("")
    if "Preencha o campo 'Placa' corretamente." in error_message:
        error_msg = {
            "error_type": "WRONG_CREDENTIALS",
            "details": error_message
        }
        self.errors.append(error_msg)
        self.logger.warning(error_msg)
        return
    elif "Favor validar o captcha corretamente" in error_message:
        # Captcha rejected: report it and retry the login page while allowed.
        self.incorrect_captcha_report(self.captcha_service,
                                      self.g_recaptcha_id)
        if self.incorrect_captcha_retries > 0:
            yield SplashRequest(self.start_url,
                                callback=self.get_login_page,
                                errback=self.errback_func,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script_10_sec_wait},
                                dont_filter=True)
        return
    # create screenshot using imgkit
    if self.capture_screenshot:
        self.take_screenshot(response, url_path='IPVANET_Consulta')
    # collapse runs of whitespace in every scraped text value
    regex = re.compile(r'\s+')
    marca_modelo = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtMarcaModelo']/text()"
        ).get("").strip())
    faixa_do_ipva = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtFaixaIPVA']/text()").
        get("").strip())
    ano_de_fabricacao = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtAnoFabric']/text()").
        get("").strip())
    municipio = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtMunicipio']/text()").
        get("").strip())
    combustivel = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtCombustivel']/text()"
        ).get("").strip())
    especie = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtEspecie']/text()").
        get("").strip())
    categoria = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtCategoria']/text()").
        get("").strip())
    tipo = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtTipo']/text()").get(
                "").strip())
    passageiros = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtPassageiros']/text()"
        ).get("").strip())
    carroceria = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtCarroceria']/text()"
        ).get("").strip())
    ultimo_licenciamento = regex.sub(
        " ",
        response.selector.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_txtAnoUltLicen']/text()"
        ).get("").strip())
    self.result.update({
        'renavam': renavam,
        'placa': placa,
        'marca_modelo': marca_modelo,
        'faixa_do_ipva': faixa_do_ipva,
        'ano_de_fabricacao': ano_de_fabricacao,
        'municipio': municipio,
        'combustivel': combustivel,
        'especie': especie,
        'categoria': categoria,
        'tipo': tipo,
        'passageiros': passageiros,
        'carroceria': carroceria,
        'ultimo_licenciamento': ultimo_licenciamento
    })
    # One table per titled section; first and last tables are skipped below.
    tables = response.selector.xpath(
        "//div[@id='conteudoPaginaPlaceHolder_Panel1']/table[.//td[@class='alinharEsquerda, negrito' and ./span[contains(@id,'conteudoPaginaPlaceHolder_Label') and not(text()=' ')]]]"
    )
    for table in tables[1:-1]:
        table_name = " ".join([
            t.strip() for t in table.xpath(
                ".//td[@class='alinharEsquerda, negrito' and ./span[contains(@id,'conteudoPaginaPlaceHolder_Label') and not(text()=' ')]]/span/text()"
            ).extract()
        ])
        table_name = self.remove_diacritics(table_name)
        main_table = table.xpath(
            "./following::table[@class='loginTable' and .//tr[not(@class) and ./td[not(@class)]/span]][1]"
        )
        rows = main_table.xpath(".//tr[not(@class)]")
        if table_name == "ipva_2020":
            # key/value table plus a second table of payment modalities
            table_content = {}
            for row in rows:
                title = self.remove_diacritics(
                    row.xpath("./td[1]/span/text()").get("").strip())
                value = re.sub(
                    '\s+', " ", " ".join(
                        row.xpath(
                            "./td[last()]/span/text()").extract()).strip())
                if title:
                    table_content.update({title: value})
            self.result.update({table_name: table_content})
            second_table = main_table.xpath(
                "./following::table[@class='loginTable'][1]//tr[1]/td/span"
            )
            st_name = self.remove_diacritics(
                second_table.xpath("./text()").get("").strip())
            st_rows = second_table.xpath(
                "./following::table[@class='loginTable'][1]//tr")
            table_content = {}
            for row in st_rows[1:]:
                title = row.xpath("./td[1]/span/text()").get("").strip()
                date = row.xpath("./td[2]/span/text()").get("").strip()
                value = re.sub(
                    '\s+', " ", " ".join(
                        row.xpath(
                            "./td[last()]/span/text()").extract()).strip())
                if title and date:
                    table_content.update({
                        'modalidades_disponiveis': title,
                        'vencimento': date,
                        'valor': value
                    })
            if table_content:
                self.result.update({st_name: table_content})
        elif table_name != "ipva_2020" and table_name != 'taxas':
            # history tables: one row per 'exercicio', value column optional
            table_content = []
            for row in rows[1:]:
                exercicio = row.xpath("./td[1]/span/text()").get(
                    "").strip()
                valor = row.xpath("./td[last()]/span/text()").get(
                    "").strip()
                is_valor_table = rows[0].xpath(
                    ".//span[contains(text(),'Valor')]")
                if exercicio:
                    rows_data = {'exercicio': exercicio}
                    if is_valor_table:
                        rows_data.update({'valor': valor})
                    table_content.append(rows_data)
            self.result.update({table_name: table_content})
        elif table_name == "taxas":
            table_content = {}
            taxas_type = rows[0].xpath("./td[1]/span/text()").get(
                "").strip()
            table_content.update({'type': taxas_type})
            # workaround for different types
            is_nada_costa = rows[1].xpath(
                ".//span[contains(text(),'NADA CONSTA')]")
            if is_nada_costa:
                i = 1
            else:
                i = 2
            for row in rows[i:]:
                title = self.remove_diacritics(
                    row.xpath("./td[1]/span/text()").get("").strip())
                value = row.xpath("./td[last()]/span/text()").get(
                    "").strip()
                if title:
                    table_content.update({title: value})
            self.result.update({table_name: table_content})
    multas_btn = response.selector.xpath("//input[contains(@id,'Multas')]")
    if multas_btn:
        # Get options for request
        multas_btn_name = multas_btn.xpath("./@name").get("").strip()
        multas_btn_value = multas_btn.xpath("./@value").get("").strip()
        EVENTTARGET = response.selector.xpath(
            "//input[@id='__EVENTTARGET']/@value").get("")
        EVENTARGUMENT = response.selector.xpath(
            "//input[@id='__EVENTARGUMENT']/@value").get("")
        VIEWSTATE = response.selector.xpath(
            "//input[@id='__VIEWSTATE']/@value").get("")
        VIEWSTATEGENERATOR = response.selector.xpath(
            "//input[@id='__VIEWSTATEGENERATOR']/@value").get("")
        EVENTVALIDATION = response.selector.xpath(
            "//input[@id='__EVENTVALIDATION']/@value").get("")
        frm_data = {
            '__EVENTTARGET': EVENTTARGET,
            '__EVENTARGUMENT': EVENTARGUMENT,
            '__VIEWSTATE': VIEWSTATE,
            '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
            '__EVENTVALIDATION': EVENTVALIDATION
        }
        frm_data.update({multas_btn_name: multas_btn_value})
        multas_url = "https://www.ipva.fazenda.sp.gov.br/IPVANET_Consulta/Pages/Aviso.aspx"
        yield SplashFormRequest(multas_url,
                                formdata=frm_data,
                                callback=self.multas_table,
                                errback=self.errback_func,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={
                                    'lua_source': script_10_sec_wait,
                                    'cookies': response.data['cookies']
                                },
                                dont_filter=True)
    else:
        self.result['multas'] = []
def get_main_page(self, response):
    """Redirect to main page.

    Parses the logged-in vehicle page: registration details plus the
    debit and infraction tables into ``self.result``.  On a login error
    it records the failure (retrying the captcha while retries remain)
    and stops.  Yields follow-up requests (DARE slip, IPVA lookup).
    """
    # Site-level error banner: wrong credentials or rejected captcha.
    error_message = response.selector.xpath(
        "//div[@class='msgErro']/text()").get("")
    print(error_message)  # NOTE(review): leftover debug print
    if "Nenhum registro encontrado, verifique os dados digitados" in error_message:
        # Wrong credentials -> record the error and abort this branch.
        error_msg = {"error_type": "WRONG_CREDENTIALS",
                     "details": error_message}
        self.errors.append(error_msg)
        self.logger.warning(error_msg)
        return
    elif "Confirme que você não é um robô." in error_message:
        # Captcha answer rejected: report it (presumably so the solving
        # service refunds it — confirm), then retry login while allowed.
        self.incorrect_captcha_report(
            self.captcha_service, self.g_recaptcha_id)
        if self.incorrect_captcha_retries > 0:
            yield Request(self.start_url,
                          callback=self.login_me,
                          meta={'dont_merge_cookies': True},
                          dont_filter=True)
        return
    # Collapses whitespace runs to a single space in every scraped value.
    regex = re.compile(r'\s+')
    # Vehicle details: each value is the stray text node sitting next to
    # its fixed label <span>.
    self.result['placa'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Placa']/../text()").get("").strip())
    self.result['marca_modelo'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Marca/Modelo']/../text()").get("").strip())
    self.result['fabricacao_modelo'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Fabricacao/Modelo']/../text()").get("").strip())
    self.result['cor'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Cor']/../text()").get("").strip())
    # Kept in a local: the IPVA lookup URL at the end needs it.
    renavam = regex.sub(" ", response.selector.xpath(
        "//span[text()='Renavam']/../text()").get("").strip())
    self.result['renavam'] = renavam
    self.result['tipo'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Tipo']/../text()").get("").strip())
    self.result['carroceria'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Carroceria']/../text()").get("").strip())
    self.result['especie'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Especie']/../text()").get("").strip())
    self.result['lugares'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Lugares']/../text()").get("").strip())
    self.result['categoria'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Categoria']/../text()").get("").strip())
    self.result['potencia'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Potência']/../text()").get("").strip())
    self.result['combustivel'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Combustível']/../text()").get("").strip())
    self.result['nome_do_proprietario'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Nome do Proprietário']/../text()").get("").strip())
    self.result['situacao_lacre'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Situação Lacre']/../text()").get("").strip())
    self.result['proprietario_anterior'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Proprietário Anterior']/../text()").get("").strip())
    self.result['origem_dos_dados_do_veiculo'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Origem dos Dados do Veículo']/../text()").get("").strip())
    self.result['placa_anterior'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Placa Anterior']/../text()").get("").strip())
    self.result['municipio_de_emplacamento'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Municipio de Emplacamento']/../text()").get("").strip())
    self.result['licenciado_ate'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Licenciado ate']/../text()").get("").strip())
    self.result['adquirido_em'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Adquirido em']/../text()").get("").strip())
    self.result['situacao'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Situação']/../text()").get("").strip())
    self.result['restricao_a_venda'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Restrição a Venda']/../text()").get("").strip())
    self.result['informacoes_pendentes_originadas_das_financeiras_via_sng_sistema_nacional_de_gravame'] = regex.sub(
        " ", response.selector.xpath(
            "//span[text()='Informações PENDENTES originadas das financeiras via SNG - Sistema Nacional de Gravame']/../text()"
        ).get("").strip())
    self.result['impedimentos'] = regex.sub(" ", response.selector.xpath(
        "//span[text()='Impedimentos']/../text()").get("").strip())
    # "Débitos" table: one dict per row, fixed columns td[1]..td[8].
    debitos_rows = response.selector.xpath(
        "//div[@id='corpo_DebitosVeiculo']/div[@id='Integral']/table[@id='TabelaIntegral']/tbody/tr")
    debitos = []
    for row in debitos_rows:
        descricao = regex.sub(" ", row.xpath("./td[1]/text()").get("").strip())
        vencimento = regex.sub(" ", row.xpath("./td[2]/text()").get("").strip())
        nominal_r = regex.sub(" ", row.xpath("./td[3]/text()").get("").strip())
        corrigido_r = regex.sub(" ", row.xpath("./td[4]/text()").get("").strip())
        desconto_r = regex.sub(" ", row.xpath("./td[5]/text()").get("").strip())
        juros_r = regex.sub(" ", row.xpath("./td[6]/text()").get("").strip())
        multa_r = regex.sub(" ", row.xpath("./td[7]/text()").get("").strip())
        atual_r = regex.sub(" ", row.xpath("./td[8]/text()").get("").strip())
        debitos.append({
            'descricao': descricao,
            'vencimento': vencimento,
            'nominal_r': nominal_r,
            'corrigido_r': corrigido_r,
            'desconto_r': desconto_r,
            'juros_r': juros_r,
            'multa_r': multa_r,
            'atual_r': atual_r})
    self.result.update({'debitos': debitos})
    # "Autuações" table.  A placeholder row containing "veículo até o
    # momento." means "nothing registered" and ends the scrape early.
    infracoes_em_autuacao_rows = response.selector.xpath(
        "//div[@id='corpo_AutuacoesVeiculo']//tbody/tr")
    infracoes_em_autuacao = []
    for row in infracoes_em_autuacao_rows:
        num_auto = " ".join([s.strip()
                             for s in row.xpath("./td[1]/text()").extract()
                             if s.strip()])
        if "veículo até o momento." in num_auto:
            break
        status = regex.sub(
            " ", " ".join(row.xpath("./td[2]/text()").extract()).strip())
        descricao = regex.sub(
            " ", " ".join(row.xpath("./td[3]/text()").extract()).strip())
        local_complemento = regex.sub(
            " ", " ".join(row.xpath("./td[4]/text()").extract()).strip())
        valor = regex.sub(
            " ", " ".join(row.xpath("./td[5]/text()").extract()).strip())
        infracoes_em_autuacao.append({
            'num_auto': num_auto,
            'status': status,
            'descricao': descricao,
            'local_complemento': local_complemento,
            'valor': valor})
    self.result.update({'infracoes_em_autuacao': infracoes_em_autuacao})
    # "Multas" (penalties) table — same row layout as above.
    penalidades_multas_rows = response.selector.xpath(
        "//div[@id='corpo_MultasVeiculo']//tbody/tr")
    penalidades_multas = []
    for row in penalidades_multas_rows:
        num_auto = regex.sub(
            " ", " ".join(row.xpath("./td[1]/text()").extract()).strip())
        if "veículo até o momento." in num_auto:
            break
        status = regex.sub(
            " ", " ".join(row.xpath("./td[2]/text()").extract()).strip())
        descricao = regex.sub(
            " ", " ".join(row.xpath("./td[3]/text()").extract()).strip())
        local_complemento = regex.sub(
            " ", " ".join(row.xpath("./td[4]/text()").extract()).strip())
        valor = regex.sub(
            " ", " ".join(row.xpath("./td[5]/text()").extract()).strip())
        penalidades_multas.append({
            'num_auto': num_auto,
            'status': status,
            'descricao': descricao,
            'local_complemento': local_complemento,
            'valor': valor})
    self.result.update({'penalidades_multas': penalidades_multas})
    # "Recursos de infração" (infraction appeals) table.
    recursos_infracao_rows = response.selector.xpath(
        "//div[@id='corpo_RecursosInfracao']//tbody/tr")
    recursos_infracao = []
    for row in recursos_infracao_rows:
        processo = regex.sub(
            " ", " ".join(row.xpath("./td[1]/text()").extract()).strip())
        if "veículo até o momento." in processo:
            break
        n_proc_renainf = regex.sub(
            " ", " ".join(row.xpath("./td[2]/text()").extract()).strip())
        numero_do_auto = regex.sub(
            " ", " ".join(row.xpath("./td[3]/text()").extract()).strip())
        detalhamento_da_infracao = regex.sub(
            " ", " ".join(row.xpath("./td[4]/text()").extract()).strip())
        situacao_do_processo = regex.sub(
            " ", " ".join(row.xpath("./td[5]/text()").extract()).strip())
        recursos_infracao.append({
            'processo': processo,
            'nº_proc_renainf': n_proc_renainf,
            'numero_do_auto': numero_do_auto,
            'detalhamento_da_infracao': detalhamento_da_infracao,
            'situacao_do_processo': situacao_do_processo})
    self.result.update({'recursos_infracao': recursos_infracao})
    # When file download is requested and the DARE button is present,
    # re-post the page's hidden form to render the payment slip via
    # Splash and hand it to the HTML->PDF step.
    dare_btn = response.selector.xpath("//input[@id='BotaoIntegral']")
    if self.get_files and dare_btn:
        dare_url = "https://consulta.detran.ro.gov.br/CentralDeConsultasInternet/Internet/DARE.asp"
        hdListaIdDebitos = response.selector.xpath(
            "//input[@name='hdListaIdDebitos']/@value").get("")
        hdPlaca = response.selector.xpath(
            "//input[@name='hdPlaca']/@value").get("")
        frm_data = {'hdListaIdDebitos': hdListaIdDebitos,
                    'hdPlaca': hdPlaca}
        print(frm_data)  # NOTE(review): leftover debug print
        yield SplashFormRequest(dare_url,
                                formdata=frm_data,
                                endpoint='render.json',
                                args=self.splash_args,
                                meta={'result_key': 'debitos'},
                                callback=self.print_html_to_pdf,
                                dont_filter=True)
    # Follow up with the IPVA (state tax) lookup for this renavam.
    ipva_url = "https://portalcontribuinte.sefin.ro.gov.br/Publico/ConsultaRenavam.jsp?renavam={}".format(renavam)
    yield Request(ipva_url,
                  callback=self.get_ipva_search,
                  meta={'renavam': renavam},
                  dont_filter=True)
def get_main_page(self, response):
    """Parse the logged-in vehicle page (detran.sp.gov.br).

    Fills ``self.result`` from every ``table.tabela`` block, then
    re-posts the page's JSF form to open the fines detail view and
    (optionally) the printable file.
    """
    # A visible (style-less) alert list means the login failed.
    error_message = response.selector.xpath(
        "//ul[contains(@class,'alert-error') and not(@style)]/li/span/text()"
    ).get("")
    if error_message:
        error_msg = {
            "error_type": "WRONG_CREDENTIALS",
            "details": error_message
        }
        self.errors.append(error_msg)
        self.logger.error(error_msg)
        return
    tables = response.selector.xpath("//table[@class='tabela']")
    # Collapses whitespace runs to a single space in scraped values.
    regex = re.compile(r'\s+')
    for table in tables:
        # Table title = the <strong> of the cell that has no value span,
        # normalized into a diacritic-free key by remove_diacritics.
        title = self.remove_diacritics(
            regex.sub(
                " ",
                table.xpath(
                    ".//tbody/tr/td[not(./span[@id])]/strong/text()").get(
                        "")))
        if "licenciamento_digital" in title:
            # Intentionally not scraped.
            continue
        elif "laudo_de_vistoria" in title:
            # Inspection reports live in a dedicated table; one dict per
            # row, columns td[1]..td[6].
            rows = response.selector.xpath(
                "//table[@class='tableResultadoLaudo']/tbody/tr")
            table_content = []
            regex = re.compile(r'\s+')
            for row in rows:
                data_da_vistoria = regex.sub(
                    " ", " ".join(row.xpath(
                        "./td[1]/span//text()").extract()).strip())
                empresa_responsavel = regex.sub(
                    " ", " ".join(row.xpath(
                        "./td[2]/span//text()").extract()).strip())
                km = regex.sub(
                    " ", " ".join(row.xpath(
                        "./td[3]/span//text()").extract()).strip())
                resultado_da_vistoria = regex.sub(
                    " ", " ".join(row.xpath(
                        "./td[4]/span//text()").extract()).strip())
                motivo_resultado = regex.sub(
                    " ", " ".join(row.xpath(
                        "./td[5]/span//text()").extract()).strip())
                situacao = regex.sub(
                    " ", " ".join(row.xpath(
                        "./td[6]/span//text()").extract()).strip())
                table_content.append({
                    'data_da_vistoria': data_da_vistoria,
                    'empresa_responsavel': empresa_responsavel,
                    'km': km,
                    'resultado_da_vistoria': resultado_da_vistoria,
                    'motivo_resultado': motivo_resultado,
                    'situacao': situacao
                })
            if table_content:
                self.result[title] = table_content
            else:
                # No rows -> record the site's "no inspections" message.
                self.result[
                    "sem_laudo_de_vistoria"] = "Não existem vistorias eletrônicas realizadas no estado de São Paulo para o veiculo."
        else:
            # Generic key/value table: label in <strong>, value spread
            # over spans / text nodes / links in the same cell.
            rows = table.xpath(".//tbody/tr[.//span[@id]]/td")
            table_content = {}
            for row in rows:
                key = self.remove_diacritics(
                    regex.sub(
                        " ",
                        row.xpath("./strong/text()").get("").strip()))
                value = regex.sub(
                    " ", " ".join(
                        row.xpath("./span//text() | ./text() | ./a/text()"
                                  ).extract()).strip())
                table_content[key] = value
            if table_content:
                self.result[title] = table_content
    # get formdata: collect every input of the page form so the JSF
    # postback can be replayed.
    multas_form = response.selector.xpath("//div[@class='container']/form")
    form_name = multas_form.xpath("./@name").get("")
    frm_data = {}
    form_inputs = multas_form.xpath(".//input")
    for inpt in form_inputs:
        inpt_name = inpt.xpath("./@name").get("")
        inpt_val = inpt.xpath("./@value").get("")
        frm_data.update({inpt_name: inpt_val})
    # get_detalhes_das_multas: simulate clicking the link inside the
    # "Multas" table (JSF encodes the clicked link as "<form>:_idcl").
    btn_id = response.selector.xpath(
        "//table[@class='tabela' and .//strong[contains(text(),'Multas')]]//a/@id"
    ).get("")
    if btn_id:
        frm_data_copy = frm_data.copy()
        frm_data_copy.update({"{}:_idcl".format(form_name): btn_id})
        url = "http://www.detran.sp.gov.br" + multas_form.xpath(
            "./@action").get("")
        yield SplashFormRequest(url,
                                formdata=frm_data_copy,
                                callback=self.get_detalhes_das_multas,
                                errback=self.errback_func,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={
                                    'lua_source': script_10_sec_wait,
                                    'cookies': response.data['cookies']
                                },
                                dont_filter=True)
    # get file: simulate the "Imprimir" partial-ajax click when file
    # downloads were requested.
    if self.get_files:
        btn_id = response.selector.xpath("//a[@title='Imprimir']/@id").get(
            "")
        if btn_id:
            frm_data_copy = frm_data.copy()
            frm_data_copy.update({
                form_name: form_name,
                'javax.faces.behavior.event': 'click',
                'javax.faces.partial.event': 'click',
                'javax.faces.source': btn_id,
                'javax.faces.partial.ajax': 'true',
                'javax.faces.partial.execute': btn_id
            })
            # The ajax endpoint comes from the form's own encodedURL field.
            url = "http://www.detran.sp.gov.br" + frm_data_copy[
                'javax.faces.encodedURL']
            yield SplashFormRequest(url,
                                    formdata=frm_data_copy,
                                    callback=self.downoad_request,
                                    errback=self.errback_func,
                                    endpoint='execute',
                                    cache_args=['lua_source'],
                                    args={
                                        'lua_source': script_10_sec_wait,
                                        'cookies': response.data['cookies']
                                    },
                                    dont_filter=True)
def parse(self, response):
    """Response handler.

    Resolves the configured selector tree (web-scraper.io-style selector
    dicts in ``self.selectors``) against the page, yields the result
    dict(s), then schedules pagination requests from any link selector.
    """
    # Skip anything that is not a usable response.
    # if not isinstance(response, HtmlResponse):
    if not isinstance(response, Response):
        self.logger.info("non-HTML response is skipped: %s", response.url)
        return
    # Selectors attached directly to the root.
    root_selectors = [
        i for i in self.selectors if i["parentSelectors"] == "_root"
    ]
    # Accumulated scrape result for this page.
    result = {}
    # Pagination target: '' (none), a URL string, or a
    # (page-range-expression, url-regex) tuple — see the bottom section.
    link = ''
    # Walk the root selectors and resolve each by type.
    for root_selector in root_selectors:
        # Text selector.
        if root_selector["type"] == "SelectorText":
            text = self.text_resolve(root_selector, response)
            # Single-element selector keeps only the first match.
            if text and not root_selector["multiple"]:
                text = [text[0]]
            # Optional regex filter: keep first match per value, else "".
            if 'regex' in root_selector and root_selector["regex"]:
                text = [
                    re.findall(root_selector["regex"], i)[0] if re.findall(
                        root_selector["regex"], i) else "" for i in text
                ]
            # multiple -> list; single -> scalar (or empty list when no hit).
            result[root_selector['name']] = text if root_selector[
                "multiple"] or not text else text[0]
        # Image selector.
        elif root_selector["type"] == "SelectorImage":
            image = self.image_resolve(root_selector, response,
                                       response.url)
            if image and not root_selector["multiple"]:
                image = [image[0]]
            result[root_selector['name']] = image if root_selector[
                "multiple"] or not image else image[0]
        # Element-attribute selector.
        elif root_selector["type"] == "SelectorElementAttribute":
            attribute = self.attribute_resolve(root_selector, response)
            if attribute and not root_selector["multiple"]:
                attribute = [attribute[0]]
            if 'regex' in root_selector and root_selector["regex"]:
                attribute = [
                    re.findall(root_selector["regex"], i)[0] if re.findall(
                        root_selector["regex"], i) else ""
                    for i in attribute
                ]
            result[root_selector['name']] = attribute if root_selector[
                "multiple"] or not attribute else attribute[0]
        # Raw-HTML selector.
        elif root_selector["type"] == "SelectorHTML":
            html = self.html_resolve(root_selector, response)
            if html and not root_selector["multiple"]:
                html = [html[0]]
            if 'regex' in root_selector and root_selector["regex"]:
                html = [
                    re.findall(root_selector["regex"], i)[0] if re.findall(
                        root_selector["regex"], i) else "" for i in html
                ]
            result[root_selector['name']] = html if root_selector[
                "multiple"] or not html else html[0]
        # Link selector -> pagination; only resolved on the first page
        # ('first' is set in meta on follow-up requests).
        elif root_selector["type"] == "SelectorLink":
            if 'first' not in response.meta:
                link = self.link_resolve(root_selector, response)
    # Element selectors spawn per-element sub-parsing.
    element_selectors = [
        i for i in self.selectors
        if i["parentSelectors"] == "_root" and i["type"] == "SelectorElement"
    ]
    # Detail selectors spawn follow-up detail-page requests.
    detail_selectors = [
        i for i in self.selectors
        if i["parentSelectors"] == "_root" and i["type"] == "SelectorDetail"
    ]
    if element_selectors:
        # NOTE(review): only the first element selector is honoured.
        yield from self.element_parse(element_selectors[0], response,
                                      result, response.url)
    elif detail_selectors:
        detail_urls = self.detail_resolve(detail_selectors[0], response,
                                          response.url)
        child_selectors = [
            i for i in self.selectors
            if i["parentSelectors"] == detail_selectors[0]['name']
        ]
        if detail_urls and child_selectors:
            yield from self.detail_parse(detail_selectors[0], detail_urls,
                                         child_selectors, result)
        else:
            yield result
    else:
        yield result
    # Pagination.
    print('--------------------------')
    print('next page', link)
    if link:
        if isinstance(link, tuple):
            # Tuple form: (page-range expression, regex that locates the
            # page number inside the current URL).
            pagerange, regex = link[0], link[1]
            try:
                # NOTE(review): eval() on selector-config data — safe only
                # if selector configs are trusted input.
                args = eval(pagerange)
            except Exception as e:
                args = 0
            if not isinstance(args, tuple):
                args = (0, args)
            # One request per page number, substituted into the URL.
            for i in range(*args):
                if not self.dynamic:
                    if self.method.lower() == 'post':
                        request = FormRequest(url=re.sub(
                            regex, str(i), response.url),
                                              formdata=self.form,
                                              headers=self.headers,
                                              cookies=self.cookies,
                                              meta={'first': False},
                                              callback=self.parse)
                    else:
                        # No callback given: Scrapy defaults to self.parse.
                        request = Request(url=re.sub(
                            regex, str(i), response.url),
                                          headers=self.headers,
                                          cookies=self.cookies,
                                          meta={'first': False})
                else:
                    # Dynamic pages go through Splash.
                    if self.method.lower() == 'post':
                        request = SplashFormRequest(url=re.sub(
                            regex, str(i), response.url),
                                                    formdata=self.form,
                                                    meta={'first': False},
                                                    args=self.args_data,
                                                    callback=self.parse)
                    else:
                        request = SplashRequest(url=re.sub(
                            regex, str(i), response.url),
                                                meta={'first': False},
                                                callback=self.parse,
                                                endpoint='execute',
                                                args=self.args_data)
                yield request
        # String form: a direct next-page URL.
        if isinstance(link, str):
            print('下一页', link)
            if not self.dynamic:
                if self.method.lower() == 'post':
                    request = FormRequest(url=link,
                                          formdata=self.form,
                                          headers=self.headers,
                                          cookies=self.cookies,
                                          callback=self.parse,
                                          dont_filter=True)
                else:
                    request = Request(url=link,
                                      headers=self.headers,
                                      cookies=self.cookies,
                                      callback=self.parse)
            else:
                if self.method.lower() == 'post':
                    request = SplashFormRequest(url=link,
                                                formdata=self.form,
                                                callback=self.parse,
                                                dont_filter=True,
                                                args=self.args_data)
                else:
                    request = SplashRequest(url=link,
                                            endpoint='execute',
                                            callback=self.parse,
                                            args=self.args_data)
            yield request
def parse(self, response):
    """Extract patent fields from a SIPO detail page.

    Emits one ``ZlItem`` per row, each followed by a POST to
    ``pam.action`` requesting that patent's full document.

    NOTE(review): the original file's indentation was mangled; the
    trailing SplashFormRequest is reconstructed as being inside the
    per-row loop (it queries by that row's publication number) — confirm
    against the original source.
    """
    li = '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[{}]'
    # Parallel node lists, zipped row-wise below.
    columns = (
        response.xpath("//h1"),                  # patent title
        response.xpath(li.format(1)),            # publication number
        response.xpath(li.format(2)),            # publication date
        response.xpath(li.format(3)),            # application number
        response.xpath(li.format(4)),            # application date
        response.xpath(li.format(5)),            # applicant
        response.xpath(li.format(6)),            # inventor
        response.xpath(li.format(8)),            # address
        response.xpath(li.format(9)),            # classification number
        response.xpath('//div[@class="cp_box"]/div/div[@class="cp_jsh"]'),
        response.xpath('//div[@class="cp_box"]/a/img/@src'),                # QR code
        response.xpath('//div[@class="cp_box"]/div[@class="cp_img"]/img/@src'),  # thumbnail
    )
    base_url = 'http://epub.sipo.gov.cn/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    def full_text(node):
        # Concatenated text content of a node ("string(.)" XPath).
        return node.xpath("string(.)").extract_first()

    for ti, on, od, an, ad, ap, inv, addr, cf, su, qr, th in zip(*columns):
        item = ZlItem()
        # Each <li> starts with a fixed-width Chinese label; the slice
        # offsets strip that label from the text.
        item['title'] = full_text(ti).strip().split()[1]
        open_no = full_text(on)[6:]
        item['openNo'] = open_no
        item['openDate'] = full_text(od)[6:]
        item['applyNo'] = full_text(an)[5:]
        item['applyDate'] = full_text(ad)[4:]
        item['applyPeople'] = full_text(ap)[4:]
        item['inventor'] = ''.join(full_text(inv)[4:].strip().split())
        item['address'] = full_text(addr)[3:]
        item['classifyNo'] = full_text(cf).split()[0][4:]
        item['summery'] = full_text(su).split()[1]
        item['qrcodeurls'] = base_url + qr.extract()
        item['thumb'] = base_url + th.extract()
        yield item
        # Fetch the full patent document for this publication number.
        yield SplashFormRequest(
            url='http://epub.sipo.gov.cn/pam.action',
            callback=self.downparse,
            method='POST',
            args={'wait': 5},
            formdata={
                "strWhere": "PN='" + "{}".format(open_no) + "'",
                "strSources": "pip"
            },
            headers=headers)