예제 #1
0
def test_splash_form_request():
    """Check how SplashFormRequest encodes ``formdata`` for POST and GET."""
    # Without an explicit method the request is a POST carrying the
    # url-encoded form in its body.
    post_request = SplashFormRequest('http://example.com',
                                     formdata={'foo': 'bar'})
    assert post_request.method == 'POST'
    assert post_request.body == b'foo=bar'
    assert post_request.meta['splash']['args']['url'] == 'http://example.com'

    # With GET the form ends up in the query string, the body stays empty,
    # and the URL handed to Splash matches the rewritten request URL.
    get_request = SplashFormRequest('http://example.com',
                                    method='GET',
                                    formdata={'foo': 'bar'},
                                    endpoint='execute')
    splash_meta = get_request.meta['splash']
    expected_url = 'http://example.com?foo=bar'
    assert get_request.method == 'GET'
    assert get_request.body == b''
    assert get_request.url == expected_url
    assert splash_meta['args']['url'] == expected_url
    assert splash_meta['endpoint'] == 'execute'
    def parse_field(self, response):
        """Yield a field item per ``<li>`` node, then a follow-up request each.

        ``degreeId`` is read from ``response.meta`` and copied onto every
        item. All items are yielded before the first request. Each request
        posts the field id to ``self.base_major_url`` with
        ``self.parse_subject`` as callback and the id in ``meta['fieldId']``.
        """
        li_nodes = Selector(text=response.body).xpath('//li')
        parent_degree_id = response.meta['degreeId']

        # Ids found during the item pass are kept so the request pass can
        # walk them again in the same order.
        field_ids = []
        for li_node in li_nodes:
            record = fieldItem()

            field_id = li_node.xpath('@id').extract()[0]
            label = li_node.xpath('./text()').extract()[0]

            record['id'] = field_id
            # Drop the U+E6A2 character from the label.
            record['name'] = re.sub('\ue6a2', '', label)
            record['degreeId'] = parent_degree_id
            field_ids.append(field_id)
            yield record

        for field_id in field_ids:
            yield SplashFormRequest(
                self.base_major_url,
                formdata={'method': 'subCategoryMl', 'key': field_id},
                callback=self.parse_subject,
                meta={'fieldId': field_id},
            )
    def parse_degree(self, response):
        """Yield a degree item per ``<li>`` of the degree list, then requests.

        The ``<li>`` nodes are taken from the ``ul.zyk-cc-ul`` element inside
        ``div.zyk-list``. All items are yielded first; afterwards one request
        per degree posts its id to ``self.base_major_url`` with
        ``self.parse_field`` as callback and the id in ``meta['degreeId']``.
        """
        page = BeautifulSoup(response.body, 'lxml')
        degree_nodes = (
            page.find('div', attrs={'class': 'zyk-list'})
            .find('ul', attrs={'class': 'zyk-cc-ul'})
            .findAll('li')
        )

        for node in degree_nodes:
            record = degreeItem()
            record['id'] = node.attrs['id']
            # Drop the U+E6A2 character from the label.
            record['name'] = re.sub('\ue6a2', '', node.text)
            yield record

        for node in degree_nodes:
            degree_id = node.attrs['id']
            yield SplashFormRequest(
                self.base_major_url,
                formdata={'method': 'subCategoryMl', 'key': degree_id},
                callback=self.parse_field,
                meta={'degreeId': degree_id},
            )
예제 #4
0
    def login_me(self, response):
        """Post the login form through Splash once the reCAPTCHA is solved.

        The anti-forgery token and the reCAPTCHA site key are read from the
        page. Nothing is yielded when ``self.solve_captcha`` returns a falsy
        value.
        """
        page = response.selector
        verification_token = page.xpath(
            "//form[@id='login-form']//input[@name='__RequestVerificationToken']/@value"
        ).get("")
        site_key = page.xpath(
            "//div[@class='g-recaptcha']/@data-sitekey").get("")

        captcha_answer = self.solve_captcha(site_key, response.url)
        if captcha_answer:
            payload = {
                'Empresa': '1',
                'Email': self.e_mail,
                'Senha': self.senha,
                'g-recaptcha-response': captcha_answer,
                '__RequestVerificationToken': verification_token,
            }
            # NOTE(review): this writes the password to stdout - confirm it
            # is still wanted outside of debugging.
            print(payload)

            splash_options = {
                'endpoint': 'execute',
                'cache_args': ['lua_source'],
                'args': {'lua_source': script_10_sec_wait},
            }
            yield SplashFormRequest(self.start_url,
                                    formdata=payload,
                                    callback=self.get_main_page,
                                    errback=self.errback_func,
                                    dont_filter=True,
                                    **splash_options)
    def parse_subject(self, response):
        """Yield a subject item per ``<li>`` node, then a follow-up request each.

        ``fieldId`` is read from ``response.meta`` and copied onto every item.
        All items are yielded before the first request. Each request posts
        the subject id to ``self.base_major_url`` with ``self.parse_major`` as
        callback and the id in ``meta['subjectId']``.
        """
        li_nodes = Selector(text=response.body).xpath('//li')
        parent_field_id = response.meta['fieldId']

        # Ids found during the item pass are kept so the request pass can
        # walk them again in the same order.
        subject_ids = []
        for li_node in li_nodes:
            record = subjectItem()

            subject_id = li_node.xpath('@id').extract()[0]
            label = li_node.xpath('./text()').extract()[0]

            record['id'] = subject_id
            # Drop the U+E6A2 character from the label.
            record['name'] = re.sub('\ue6a2', '', label)
            record['fieldId'] = parent_field_id
            subject_ids.append(subject_id)
            yield record

        for subject_id in subject_ids:
            yield SplashFormRequest(
                self.base_major_url,
                formdata={'method': 'subCategoryXk', 'key': subject_id},
                callback=self.parse_major,
                meta={'subjectId': subject_id},
            )
    def login_me(self, response):
        """Re-submit the "Entrar" form with the login fields filled in.

        Every ``<input>`` of the form that contains the "Entrar" button is
        copied into the payload; four of them are overridden by name. The
        target URL is built from the ``javax.faces.encodedURL`` field, so a
        ``KeyError`` is raised if the form does not provide it.
        """
        payload = {}
        login_inputs = response.selector.xpath(
            "//form[.//input[@value='Entrar']]//input")
        for field in login_inputs:
            name = field.xpath("./@name").get("")
            value = field.xpath("./@value").get("")
            # Substring checks are ordered: the first marker found in the
            # input name decides the override.
            if "urlRedirectLogin" in name:
                value = "/wps/portal/portaldetran/cidadao/infracoes/servicos/consultaMultas"
            elif "modalMensagem" in name:
                value = "Para realizar a pesquisa de débitos e restrições de veículos do proprietário,<br /> responda algumas perguntas, acesse com seu CPF e senha ou cadastre-se abaixo:"
            elif "numeroLogin" in name:
                value = self.cpf_cnpj
            elif "senhaLogin" in name:
                value = self.senha
            payload[name] = value

        target_url = ("http://www.detran.sp.gov.br"
                      + payload['javax.faces.encodedURL'])
        splash_options = {
            'endpoint': 'execute',
            'cache_args': ['lua_source'],
            'args': {'lua_source': script_10_sec_wait},
        }
        yield SplashFormRequest(target_url,
                                formdata=payload,
                                callback=self.set_renavam,
                                errback=self.errback_func,
                                dont_filter=True,
                                **splash_options)
예제 #7
0
 def start_requests(self):
     """Yield one initial request per URL in ``self.start_urls``.

     ``self.dynamic`` selects Splash requests over plain Scrapy ones, and a
     ``self.method`` equal to ``'post'`` (case-insensitive) selects the
     form-request variant. Every request uses ``self.parse_first`` as
     callback.
     """
     for url in self.start_urls:
         render_with_splash = bool(self.dynamic)
         send_form = self.method.lower() == 'post'

         if not render_with_splash and send_form:
             yield FormRequest(url=url,
                               formdata=self.form,
                               headers=self.headers,
                               cookies=self.cookies,
                               callback=self.parse_first,
                               dont_filter=True)
         elif not render_with_splash:
             yield Request(url=url,
                           headers=self.headers,
                           cookies=self.cookies,
                           callback=self.parse_first)
         elif send_form:
             yield SplashFormRequest(url=url,
                                     formdata=self.form,
                                     callback=self.parse_first,
                                     dont_filter=False,
                                     args=self.args_data)
         else:
             yield SplashRequest(url,
                                 callback=self.parse_first,
                                 endpoint='execute',
                                 args=self.args_data)
예제 #8
0
    def make_request_from_data(self, data):
        """Build a Scrapy request from one entry read from the redis key.

        :param data: raw payload taken from redis; it is decoded with
            ``self.redis_encoding`` and parsed as JSON into a
            ``ScheduledRequest``
        :return: a ``SplashFormRequest``/``SplashRequest`` when the scheduled
            meta has a ``splash`` key, otherwise a ``FormRequest``/``Request``;
            the form variants are used when the method is ``"POST"``
        :raises OSError: if ``self.get_callback`` does not return a callable
        """
        decoded = bytes_to_str(data, self.redis_encoding)
        scheduled = ScheduledRequest(**json.loads(decoded))

        callback, dont_filter = self.get_callback(scheduled.callback)
        if not callable(callback):
            # The message reads "<name> has no callback function specified".
            raise OSError(f"{scheduled.callback}没有指定回调函数")

        request_kwargs = dict(
            url=scheduled.url,
            method=scheduled.method,
            meta=scheduled.meta,
            dont_filter=dont_filter,
            callback=callback,
        )
        is_post = scheduled.method == "POST"

        if 'splash' not in scheduled.meta:
            if is_post:
                return FormRequest(formdata=scheduled.body, **request_kwargs)
            return Request(**request_kwargs)

        splash_meta = scheduled.meta.get('splash')
        request_kwargs['args'] = {
            'wait': splash_meta.get('wait', 2),
            # Images are not downloaded by default.
            'images': splash_meta.get('images', 0),
        }
        if is_post:
            return SplashFormRequest(formdata=scheduled.body, **request_kwargs)
        return SplashRequest(**request_kwargs)
    def start_requests(self):
        """Start the crawl by posting the credentials to the sign-in URL.

        The form is sent to ``self.start_url + '/login/sign_in'`` through the
        Splash ``execute`` endpoint; the response goes to ``self.sign_in_me``.
        """
        credentials = {"email": self.e_mail, "password": self.senha}
        sign_in_url = self.start_url + '/login/sign_in'

        splash_options = {
            'endpoint': 'execute',
            'cache_args': ['lua_source'],
            'args': {'lua_source': script_10_sec_wait},
        }
        yield SplashFormRequest(sign_in_url,
                                formdata=credentials,
                                callback=self.sign_in_me,
                                errback=self.errback_func,
                                dont_filter=True,
                                **splash_options)
예제 #10
0
    def parse(self, response):
        """Yield a SplashFormRequest for the current URL with a sample form."""
        # Notes on the keys of meta['splash']:
        # - 'args': the arguments sent to Splash.
        # - 'endpoint': the Splash endpoint to use; the default is
        #   render.html.
        # - 'splash_url': overrides the Splash URL configured in settings.py.
        # - 'splash_headers': lets you add or change the HTTP headers sent to
        #   the Splash server; these are not the headers sent to the remote
        #   website.
        # - 'dont_send_headers': set to True if you do not want headers
        #   passed to Splash.
        # - 'slot_policy': lets you customise how Splash requests are
        #   synchronised.
        # - 'dont_process_response': when True, SplashMiddleware does not
        #   change the default scrapy.Response. By default a SplashResponse
        #   subclass such as SplashTextResponse is returned.
        # - 'magic_response': defaults to True; Splash then sets some
        #   Response attributes automatically, e.g. response.headers and
        #   response.body.

        # SplashFormRequest usage.
        form_request = SplashFormRequest(response.url,
                                         callback=self.next_parse,
                                         formdata={'name': '111'})
        yield form_request
    def set_renavam(self, response):
        """Submit the Renavam form after login, or record a login failure.

        If the page shows a visible ``alert-error`` message, a
        ``WRONG_CREDENTIALS`` entry is appended to ``self.errors``, logged as
        a warning, and nothing is yielded. Nothing is yielded either when
        ``self.solve_captcha`` returns a falsy value. Otherwise all inputs of
        the Renavam form are posted through Splash, with the ``:Renavam``
        input replaced by ``self.renavam``.
        """
        page = response.selector

        login_error = page.xpath(
            "//ul[contains(@class,'alert-error') and not(@style)]/li/span/text()"
        ).get("")
        if login_error:
            failure = {
                "error_type": "WRONG_CREDENTIALS",
                "details": login_error,
            }
            self.errors.append(failure)
            self.logger.warning(failure)
            return

        # The reCAPTCHA site key is needed to request a solution.
        site_key = page.xpath(
            "//div[@class='g-recaptcha']/@data-sitekey").get("")
        captcha_answer = self.solve_captcha(site_key, response.url)
        if not captcha_answer:
            return

        form = page.xpath("//form[.//td[contains(text(),'Renavam')]]")
        form_name = form.xpath("./@name").get("")
        payload = {
            f"{form_name}:_idcl": form_name.replace("form", "btAvancar"),
            'g-recaptcha-response': captcha_answer,
        }
        for field in form.xpath(".//input"):
            name = field.xpath("./@name").get("")
            value = field.xpath("./@value").get("")
            payload[name] = self.renavam if ":Renavam" in name else value

        action = form.xpath("./@action").get("")
        yield SplashFormRequest("http://www.detran.sp.gov.br" + action,
                                formdata=payload,
                                callback=self.get_main_page,
                                errback=self.errback_func,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={
                                    'lua_source': script_10_sec_wait,
                                    'cookies': response.data['cookies'],
                                },
                                dont_filter=True)
예제 #12
0
 def parse_affiliates(self, response):
     """Yield a form request per country plus the affiliate rows on the page.

     For every entry of ``countries`` (not defined in this method; it must
     be available from the enclosing scope) a ``SplashFormRequest`` is
     yielded, followed by one dict per row of the ``affiliateTable`` in the
     current ``response``, tagged with that country.
     """
     # The rows come from the current response, so they are the same for
     # every country and are looked up only once.
     # NOTE(review): this repeats the same rows once per country - confirm
     # whether they should instead be parsed in the form request's callback.
     affiliate_rows = response.xpath("//table[@id='affiliateTable']/tbody/tr")

     for country in countries:
         # NOTE(review): ``formxpath`` is documented as an argument of
         # ``FormRequest.from_response``; confirm the constructor accepts it.
         yield SplashFormRequest(
             url='https://www.crossfit.com/affiliate-list',
             formxpath=
             "//div[@class='form-group']/select[@id='countryFilter']",
             formdata={'option': country})
         for row in affiliate_rows:
             # Cell text is read from the table row; ``country`` is only the
             # value submitted in the form and is used as the label.
             yield {
                 'gym name': row.xpath('.//td/a/text()').get(),
                 'local': row.xpath('.//td/text()').get(),
                 'country': country
             }
예제 #13
0
 def login_me(self, response):
     """Post the Renavam lookup form through Splash.

     ``self.renavam`` is sent as ``txt_renavam`` and the cookies found in
     ``response.data`` are forwarded to the Lua script. The response is
     handled by ``self.get_main_page``.
     """
     lookup_url = 'http://www.sefaz.ba.gov.br/scripts/ipva/dae/VeiculoCadastrado/ipva_texto_obter_desconto200.asp'
     payload = {'txt_renavam': self.renavam, 'txt_renavam1': ''}

     splash_args = {
         'lua_source': script,
         'cookies': response.data['cookies'],
     }
     yield SplashFormRequest(lookup_url,
                             formdata=payload,
                             callback=self.get_main_page,
                             errback=self.errback_func,
                             endpoint='execute',
                             cache_args=['lua_source'],
                             args=splash_args,
                             dont_filter=True)
예제 #14
0
    def get_debito_calculado_ipva(self, response):
        """Collect the calculated IPVA values from the page.

        Four stripped text values are read from the page. When the single
        quota value is empty nothing else happens. Otherwise the row is
        either appended to ``self.result['ipva_do_veiculo']`` or, when
        ``self.get_files`` is truthy, passed in the ``meta`` of a Splash form
        request whose callback is ``self.print_html_to_pdf``.
        """
        page = response.selector
        # Result key -> XPath, in the order the row dict is built.
        queries = (
            ('valor_do_ipva',
             "//font[contains(.,'Pagamento de cota')]/text()"),
            ('ano_exercicio',
             "//span[contains(.,'Ano Exercício')]/../input/@value"),
            ('data_do_vencimento',
             "//span[contains(.,'Data do Vencimento')]/../input/@value"),
            ('valor_da_cota_unica',
             "//span[contains(.,'Valor da Cota única')]/../input/@value"),
        )
        row_data = {
            key: page.xpath(query).get("").strip() for key, query in queries
        }

        print(row_data['valor_da_cota_unica'])
        if not row_data['valor_da_cota_unica']:
            return

        if not self.get_files:
            collected = self.result.get('ipva_do_veiculo', [])
            collected.append(row_data)
            self.result.update({'ipva_do_veiculo': collected})
            return

        file_url = "http://www.sefaz.ba.gov.br/scripts/ipva/dae/VeiculoCadastrado/result_dae_avulso_ipva.asp"
        payload = {
            'Lnum_cnpj_cpf_base': '',
            'Lnum_cnpj_cpf_filial': '',
            'Lnum_cnpj_cpf_digito': '',
        }
        # No errback is attached to this request.
        yield SplashFormRequest(
            file_url,
            formdata=payload,
            callback=self.print_html_to_pdf,
            endpoint='execute',
            cache_args=['lua_source'],
            args={
                'lua_source': script,
                'cookies': response.data['cookies'],
            },
            meta={'row_data': row_data},
            dont_filter=True)
예제 #15
0
    def get_login_page(self, response):
        """Post the vehicle lookup form once the reCAPTCHA is solved.

        The reCAPTCHA site key is read from the page and handed to
        ``self.solve_captcha``; nothing is yielded when that returns a falsy
        value. Otherwise the hidden ``__*`` inputs are copied from the page
        and posted, together with ``self.renavam``, ``self.placa`` and the
        captcha answer, to ``self.start_url`` through Splash.
        """
        page = response.selector

        site_key = page.xpath(
            "//div[@class='g-recaptcha']/@data-sitekey").get("")
        captcha_answer = self.solve_captcha(site_key, response.url)
        if not captcha_answer:
            return

        # Hidden inputs that are echoed back unchanged, in payload order.
        hidden_ids = (
            '__EVENTTARGET',
            '__EVENTARGUMENT',
            '__VIEWSTATE',
            '__VIEWSTATEGENERATOR',
            '__EVENTVALIDATION',
        )
        payload = {}
        for input_id in hidden_ids:
            query = "//input[@id='{}']/@value".format(input_id)
            payload[input_id] = page.xpath(query).get("")

        payload['ctl00$conteudoPaginaPlaceHolder$txtRenavam'] = self.renavam
        payload['ctl00$conteudoPaginaPlaceHolder$txtPlaca'] = self.placa
        payload['g-recaptcha-response'] = captcha_answer
        payload['ctl00$conteudoPaginaPlaceHolder$btn_Consultar'] = 'Consultar'

        yield SplashFormRequest(self.start_url,
                                formdata=payload,
                                callback=self.login_me,
                                errback=self.errback_func,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={
                                    'lua_source': script_10_sec_wait,
                                    'cookies': response.data['cookies'],
                                },
                                dont_filter=True)
예제 #16
0
    def start_requests(self):
        """Yield one annual-report query per hard-coded ticker.

        The first digit of the ticker picks the ``column`` form value
        (``'0'`` -> ``'szse_main'``, ``'6'`` -> ``'sse'``); any other prefix
        prints "Wrong ticker" and is skipped. Responses go to ``self.parse``.
        """
        # First digit -> exchange column: Shenzhen for '0', Shanghai for '6'.
        column_by_prefix = {'0': 'szse_main', '6': 'sse'}

        for ticker in ('600001', '000777'):
            column = column_by_prefix.get(ticker[0])
            if column is None:
                print("Wrong ticker")
                continue

            # Key order matches the order in which the form is encoded.
            query = {
                'stock': ticker,
                'searchkey': '年年度报告',
                'category': 'category_ndbg_szsh',
                'pageNum': '1',
                'pageSize': '30',
                'column': column,
                'tabName': 'fulltext',
                'sortName': '',
                'sortType': '',
                'limit': '',
                'seDate': '',
            }
            yield SplashFormRequest(
                url='http://www.cninfo.com.cn/cninfo-new/announcement/query',
                formdata=query,
                callback=self.parse,
            )
예제 #17
0
 def get_result_debito_ipva(self, response):
     """Request the calculated IPVA debit for today's date.

     The ``cota3`` value is taken from the text after the last ``cota3=``
     in ``response.url`` and reused in the target URL. Today's date is
     posted as ``txt_dtc_pagamento`` in ``dd/mm/YYYY`` form.
     """
     cota3 = response.url.rpartition("cota3=")[2]
     target_url = f"http://www.sefaz.ba.gov.br/scripts/ipva/dae/VeiculoCadastrado/debito_calculado_ipva.asp?cota3={cota3}"
     payload = {'txt_dtc_pagamento': dt.now().strftime("%d/%m/%Y")}

     splash_args = {
         'lua_source': script,
         'cookies': response.data['cookies'],
     }
     yield SplashFormRequest(target_url,
                             formdata=payload,
                             callback=self.get_debito_calculado_ipva,
                             errback=self.errback_func,
                             endpoint='execute',
                             cache_args=['lua_source'],
                             args=splash_args,
                             dont_filter=True)
예제 #18
0
    def login_me(self, response):
        """Post the credentials plus the page's hidden inputs to the login URL.

        Six inputs are copied from the page by name and sent together with
        ``self.login`` and ``self.senha``. The response is handled by
        ``self.select_contract``.
        """
        login_url = "https://auth.netcombo.com.br/login"

        payload = {'Username': self.login, 'password': self.senha}
        # Inputs echoed back unchanged, in payload order.
        copied_inputs = ('client_id', 'redirect_uri', 'response_type',
                         'scope', 'state', 'authMs')
        for input_name in copied_inputs:
            query = "//input[@name='{}']/@value".format(input_name)
            payload[input_name] = response.selector.xpath(query).get("")
        payload['Auth_method'] = 'UP'

        # NOTE(review): this writes the password to stdout - confirm it is
        # still wanted outside of debugging.
        print(payload)

        splash_args = {
            'lua_source': script_30_sec_wait,
            'cookies': response.data['cookies'],
            'timeout': 60,
            'images': 0,
        }
        yield SplashFormRequest(login_url,
                                formdata=payload,
                                callback=self.select_contract,
                                errback=self.errback_func,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args=splash_args,
                                dont_filter=True)
예제 #19
0
    def get_ipva_search(self, response):
        """Post the Renavam query once the reCAPTCHA is solved.

        The site key is the text after the last ``render=`` in the reCAPTCHA
        script URL. Nothing is yielded when ``self.solve_captcha`` returns a
        falsy value. ``response.meta['renavam']`` is sent in the form and
        passed on in the new request's ``meta``.
        """
        page = response.selector
        action = page.xpath("//input[@id='action']/@value").get("")
        csrf_token = page.xpath("//input[@id='csrf_token']/@value").get("")
        recaptcha_src = page.xpath(
            "//script[contains(@src,'recaptcha/api')]/@src").get("")
        site_key = recaptcha_src.split("render=")[-1]

        captcha_answer = self.solve_captcha(
            site_key,
            response.request.url,
            captcha_type=5,
            captcha_action='portal_consulta_renavam',
        )
        if captcha_answer:
            renavam = response.meta['renavam']
            payload = {
                'action': action,
                'renavam': renavam,
                'csrf_token': csrf_token,
                'recaptcha_response': captcha_answer,
            }
            result_url = "https://portalcontribuinte.sefin.ro.gov.br/Publico/__Resultado_Renavam_.jsp"
            yield SplashFormRequest(result_url,
                                    formdata=payload,
                                    endpoint='render.json',
                                    args=self.splash_args,
                                    meta={'renavam': renavam},
                                    callback=self.get_ipva_result,
                                    dont_filter=True)
예제 #20
0
    def login_me(self, response):
        """Post Renavam and plate to ``self.start_url`` after the reCAPTCHA.

        The reCAPTCHA site key is read from the page and handed to
        ``self.solve_captcha``; nothing is yielded when that returns a falsy
        value. The response is handled by ``self.get_main_page``.
        """
        site_key = response.selector.xpath(
            "//div[@class='g-recaptcha']/@data-sitekey").get("")

        captcha_answer = self.solve_captcha(site_key, response.url)
        if captcha_answer:
            payload = {
                'Renavam': self.renavam,
                'Placa': self.placa,
                'g-recaptcha-response': captcha_answer,
            }
            splash_options = {
                'endpoint': 'execute',
                'cache_args': ['lua_source'],
                'args': {'lua_source': autos_detran_ro_script},
            }
            yield SplashFormRequest(self.start_url,
                                    formdata=payload,
                                    callback=self.get_main_page,
                                    errback=self.errback_func,
                                    dont_filter=True,
                                    **splash_options)
예제 #21
0
    def search_result(self, response):
        if response.css("#ctl00_RadWindow1_C_btnOk"):

            form_data = dict()
            input_fields = response.css("form input")
            for ifield in input_fields:
                if ifield.css("input::attr(name)").extract_first() == None:
                    continue
                if ifield.css("input::attr(value)").extract_first() == None:
                    form_data[ifield.css(
                        "input::attr(name)").extract_first()] = ''
                elif ifield.css(
                        "input::attr(value)").extract_first() == 'I Disagree':
                    continue
                elif ifield.css("input::attr(value)").extract_first() == '':
                    form_data[ifield.css(
                        "input::attr(name)").extract_first()] = ''
                else:
                    form_data[ifield.css("input::attr(name)").extract_first(
                    )] = ifield.css("input::attr(value)").extract_first()

            yield SplashFormRequest(
                url=
                'https://pdonline.brisbane.qld.gov.au/MasterPlan/Modules/Enquirer/PropertySearch.aspx',
                formdata=form_data,
                callback=self.search_result,
                endpoint='execute',
                args={
                    'timeout': '90',
                    'wait': '30',
                    'lua_source': script,
                },
                session_id='1',
                meta={
                    'input_data': response.meta["input_data"],
                })
        else:
            # Cookie handling
            for cookie in response.data["cookies"]:
                self.dynamic_cookie[cookie["name"]] = cookie["value"]

            form_data = dict()
            input_fields = response.css("form input")
            for ifield in input_fields:
                if ifield.css("input::attr(name)").extract_first() == None:
                    continue
                if ifield.css("input::attr(value)").extract_first() == None:
                    form_data[ifield.css(
                        "input::attr(name)").extract_first()] = ''
                else:
                    form_data[ifield.css("input::attr(name)").extract_first(
                    )] = ifield.css("input::attr(value)").extract_first()
            for key in response.meta["input_data"]:
                #  if response.meta["input_data"][key]:

                if key == "Suburb":
                    form_data["ctl00$MainContent$SuburbCombo"] = response.meta[
                        "input_data"][key]
                elif key == "Street Name":
                    street_name_define = {
                        "logEntries": [],
                        "value": "",
                        "text": "",
                        "enabled": True,
                        "checkedIndices": [],
                        "checkedItemsTextOverflows": False
                    }
                    street_name_define["value"] = response.meta["input_data"][
                        key]
                    street_name_define["text"] = response.meta["input_data"][
                        key]
                    form_data[
                        "ctl00_MainContent_StreetCombo_ClientState"] = street_name_define
                    form_data["ctl00$MainContent$StreetCombo"] = response.meta[
                        "input_data"][key]
                elif key == "Unit Number From":
                    form_data[
                        "ctl00$MainContent$FromUnitNumberTextBox"] = response.meta[
                            "input_data"][key]
                    ctl00_MainContent_FromUnitNumberTextBox_ClientState = {
                        "enabled": True,
                        "emptyMessage": "",
                        "validationText": "",
                        "valueAsString": "",
                        "lastSetTextBoxValue": ""
                    }
                    ctl00_MainContent_FromUnitNumberTextBox_ClientState[
                        "validationText"] = response.meta["input_data"][key]
                    ctl00_MainContent_FromUnitNumberTextBox_ClientState[
                        "valueAsString"] = response.meta["input_data"][key]
                    ctl00_MainContent_FromUnitNumberTextBox_ClientState[
                        "lastSetTextBoxValue"] = response.meta["input_data"][
                            key]
                    form_data[
                        "ctl00_MainContent_FromUnitNumberTextBox_ClientState"] = ctl00_MainContent_FromUnitNumberTextBox_ClientState
                elif key == "Unit Number To":
                    form_data[
                        "ctl00$MainContent$ToUnitNumberTextBox"] = response.meta[
                            "input_data"][key]
                    ctl00_MainContent_ToUnitNumberTextBox_ClientState = {
                        "enabled": True,
                        "emptyMessage": "",
                        "validationText": "",
                        "valueAsString": "",
                        "lastSetTextBoxValue": ""
                    }
                    ctl00_MainContent_ToUnitNumberTextBox_ClientState[
                        "validationText"] = response.meta["input_data"][key]
                    ctl00_MainContent_ToUnitNumberTextBox_ClientState[
                        "valueAsString"] = response.meta["input_data"][key]
                    ctl00_MainContent_ToUnitNumberTextBox_ClientState[
                        "lastSetTextBoxValue"] = response.meta["input_data"][
                            key]
                    form_data[
                        "ctl00_MainContent_ToUnitNumberTextBox_ClientState"] = ctl00_MainContent_ToUnitNumberTextBox_ClientState
                elif key == "Street Number From":
                    form_data[
                        "ctl00$MainContent$FromStreetNumberTextBox"] = response.meta[
                            "input_data"][key]
                    ctl00_MainContent_FromStreetNumberTextBox_ClientState = {
                        "enabled": True,
                        "emptyMessage": "",
                        "validationText": "",
                        "valueAsString": "",
                        "lastSetTextBoxValue": ""
                    }
                    ctl00_MainContent_FromStreetNumberTextBox_ClientState[
                        "validationText"] = response.meta["input_data"][key]
                    ctl00_MainContent_FromStreetNumberTextBox_ClientState[
                        "valueAsString"] = response.meta["input_data"][key]
                    ctl00_MainContent_FromStreetNumberTextBox_ClientState[
                        "lastSetTextBoxValue"] = response.meta["input_data"][
                            key]
                    form_data[
                        "ctl00_MainContent_FromStreetNumberTextBox_ClientState"] = ctl00_MainContent_FromStreetNumberTextBox_ClientState
                elif key == "Street Number To":
                    form_data[
                        "ctl00$MainContent$ToStreetNumberTextBox"] = response.meta[
                            "input_data"][key]
                    ctl00_MainContent_ToStreetNumberTextBox_ClientState = {
                        "enabled": True,
                        "emptyMessage": "",
                        "validationText": "",
                        "valueAsString": "",
                        "lastSetTextBoxValue": ""
                    }
                    ctl00_MainContent_ToStreetNumberTextBox_ClientState[
                        "validationText"] = response.meta["input_data"][key]
                    ctl00_MainContent_ToStreetNumberTextBox_ClientState[
                        "valueAsString"] = response.meta["input_data"][key]
                    ctl00_MainContent_ToStreetNumberTextBox_ClientState[
                        "lastSetTextBoxValue"] = response.meta["input_data"][
                            key]
                    form_data[
                        "ctl00_MainContent_ToStreetNumberTextBox_ClientState"] = ctl00_MainContent_ToStreetNumberTextBox_ClientState

                elif key == "Plan Number":
                    form_data["ctl00$MainContent$PlanTextBox"] = response.meta[
                        "input_data"][key]
                    ctl00_MainContent_PlanTextBox_ClientState = {
                        "enabled": True,
                        "emptyMessage": "",
                        "validationText": "",
                        "valueAsString": "",
                        "lastSetTextBoxValue": ""
                    }
                    ctl00_MainContent_PlanTextBox_ClientState[
                        "validationText"] = response.meta["input_data"][key]
                    ctl00_MainContent_PlanTextBox_ClientState[
                        "valueAsString"] = response.meta["input_data"][key]
                    ctl00_MainContent_PlanTextBox_ClientState[
                        "lastSetTextBoxValue"] = response.meta["input_data"][
                            key]
                elif key == "Lot Number":
                    form_data["ctl00$MainContent$LotTextBox"] = response.meta[
                        "input_data"][key]
                    ctl00_MainContent_LotTextBox_ClientState = {
                        "enabled": True,
                        "emptyMessage": "",
                        "validationText": "",
                        "valueAsString": "",
                        "lastSetTextBoxValue": ""
                    }
                    ctl00_MainContent_LotTextBox_ClientState[
                        "validationText"] = response.meta["input_data"][key]
                    ctl00_MainContent_LotTextBox_ClientState[
                        "valueAsString"] = response.meta["input_data"][key]
                    ctl00_MainContent_LotTextBox_ClientState[
                        "lastSetTextBoxValue"] = response.meta["input_data"][
                            key]
                    form_data[
                        "ctl00_MainContent_LotTextBox_ClientState"] = ctl00_MainContent_LotTextBox_ClientState
                else:
                    pass

            for key in form_data:
                if isinstance(form_data[key], dict):
                    form_data[key] = json.dumps(form_data[key])
                if isinstance(form_data[key], int):
                    form_data[key] = str(form_data[key])

            # first method that i used to submit form
            yield FormRequest(
                url=
                'https://pdonline.brisbane.qld.gov.au/MasterPlan/Modules/Enquirer/PropertySearch.aspx',
                formdata=form_data,
                callback=self.submit_form,
                cookies=self.dynamic_cookie)

            # Second method that i have used to submit form
            yield SplashFormRequest(
                url=
                'https://pdonline.brisbane.qld.gov.au/MasterPlan/Modules/Enquirer/PropertySearch.aspx',
                formdata=form_data,
                callback=self.submit_form,
                endpoint='execute',
                args={
                    'timeout': '90',
                    'wait': '30',
                    'lua_source': script,
                },
                session_id='1')
예제 #22
0
    def login_me(self, response):
        """Parse the IPVA consultation page into ``self.result``.

        The page-level error span is checked first: a "Placa" validation
        message is recorded in ``self.errors`` as ``WRONG_CREDENTIALS``; a
        captcha rejection is reported through ``self.incorrect_captcha_report``
        and the start page is requested again while
        ``self.incorrect_captcha_retries`` is positive.  Otherwise the vehicle
        summary spans and every section table are copied into ``self.result``
        and, when a "Multas" button exists, its form is posted back.

        Args:
            response: response object exposing ``selector`` (queried with
                XPath) and ``data['cookies']`` (forwarded to the follow-up
                request).

        Yields:
            ``SplashRequest`` for ``self.start_url`` on a rejected captcha, or
            ``SplashFormRequest`` posting the "Multas" button.
        """
        whitespace = re.compile(r'\s+')
        page = response.selector

        def span_text(span_id):
            # Stripped text of the page-level <span> whose id ends in span_id.
            return page.xpath(
                "//span[@id='conteudoPaginaPlaceHolder_{}']/text()".format(
                    span_id)).get("").strip()

        def cell_text(row, path):
            # Stripped text of the first node matched by `path` below `row`.
            return row.xpath(path).get("").strip()

        def joined_cell_text(row, path):
            # Every text node matched by `path`, joined, with runs of
            # whitespace collapsed to a single space.
            return whitespace.sub(
                " ", " ".join(row.xpath(path).extract()).strip())

        renavam = span_text('txtRenavam')
        placa = span_text('txtPlaca')
        self.logger.debug("renavam: %s", renavam)
        self.logger.debug("placa: %s", placa)

        error_message = page.xpath(
            "//span[@id='conteudoPaginaPlaceHolder_lblErro']/text()").get("")
        if "Preencha o campo 'Placa' corretamente." in error_message:
            error_msg = {
                "error_type": "WRONG_CREDENTIALS",
                "details": error_message
            }
            self.errors.append(error_msg)
            self.logger.warning(error_msg)
            return
        if "Favor validar o captcha corretamente" in error_message:
            self.incorrect_captcha_report(self.captcha_service,
                                          self.g_recaptcha_id)
            if self.incorrect_captcha_retries > 0:
                yield SplashRequest(self.start_url,
                                    callback=self.get_login_page,
                                    errback=self.errback_func,
                                    endpoint='execute',
                                    cache_args=['lua_source'],
                                    args={'lua_source': script_10_sec_wait},
                                    dont_filter=True)
            return

        # create screenshot using imgkit
        if self.capture_screenshot:
            self.take_screenshot(response, url_path='IPVANET_Consulta')

        # (result key, span id suffix) for the whitespace-normalised summary
        # fields, in the order they are stored.
        summary_spans = (
            ('marca_modelo', 'txtMarcaModelo'),
            ('faixa_do_ipva', 'txtFaixaIPVA'),
            ('ano_de_fabricacao', 'txtAnoFabric'),
            ('municipio', 'txtMunicipio'),
            ('combustivel', 'txtCombustivel'),
            ('especie', 'txtEspecie'),
            ('categoria', 'txtCategoria'),
            ('tipo', 'txtTipo'),
            ('passageiros', 'txtPassageiros'),
            ('carroceria', 'txtCarroceria'),
            ('ultimo_licenciamento', 'txtAnoUltLicen'),
        )
        summary = {'renavam': renavam, 'placa': placa}
        for result_key, span_id in summary_spans:
            summary[result_key] = whitespace.sub(" ", span_text(span_id))
        self.result.update(summary)

        tables = page.xpath(
            "//div[@id='conteudoPaginaPlaceHolder_Panel1']/table[.//td[@class='alinharEsquerda, negrito' and ./span[contains(@id,'conteudoPaginaPlaceHolder_Label') and not(text()=' ')]]]"
        )
        # The first and the last matching header tables are skipped.
        for table in tables[1:-1]:
            table_name = " ".join(
                t.strip() for t in table.xpath(
                    ".//td[@class='alinharEsquerda, negrito' and ./span[contains(@id,'conteudoPaginaPlaceHolder_Label') and not(text()=' ')]]/span/text()"
                ).extract())
            # NOTE(review): remove_diacritics presumably also slugifies the
            # name (it is compared with 'ipva_2020' below) - confirm.
            table_name = self.remove_diacritics(table_name)

            main_table = table.xpath(
                "./following::table[@class='loginTable' and .//tr[not(@class) and ./td[not(@class)]/span]][1]"
            )
            rows = main_table.xpath(".//tr[not(@class)]")

            if table_name == "ipva_2020":
                table_content = {}
                for row in rows:
                    title = self.remove_diacritics(
                        cell_text(row, "./td[1]/span/text()"))
                    value = joined_cell_text(row, "./td[last()]/span/text()")
                    if title:
                        table_content[title] = value
                self.result.update({table_name: table_content})

                # The table after the IPVA table carries a caption; the table
                # after that caption holds the payment rows.
                second_table = main_table.xpath(
                    "./following::table[@class='loginTable'][1]//tr[1]/td/span"
                )
                st_name = self.remove_diacritics(
                    second_table.xpath("./text()").get("").strip())
                st_rows = second_table.xpath(
                    "./following::table[@class='loginTable'][1]//tr")

                table_content = {}
                for row in st_rows[1:]:
                    title = cell_text(row, "./td[1]/span/text()")
                    date = cell_text(row, "./td[2]/span/text()")
                    value = joined_cell_text(row, "./td[last()]/span/text()")
                    if title and date:
                        # NOTE(review): every qualifying row writes the same
                        # three keys, so only the last row survives - confirm
                        # this is intended before changing the schema.
                        table_content.update({
                            'modalidades_disponiveis': title,
                            'vencimento': date,
                            'valor': value
                        })
                if table_content:
                    self.result.update({st_name: table_content})

            elif table_name == "taxas":
                # Guard the positional lookups: a table with fewer than two
                # rows used to raise IndexError here.
                taxas_type = (cell_text(rows[0], "./td[1]/span/text()")
                              if rows else "")
                table_content = {'type': taxas_type}
                # workaround for different types: when the second row says
                # 'NADA CONSTA' the data starts one row earlier.
                has_nada_consta = len(rows) > 1 and bool(rows[1].xpath(
                    ".//span[contains(text(),'NADA CONSTA')]"))
                first_data_row = 1 if has_nada_consta else 2
                for row in rows[first_data_row:]:
                    title = self.remove_diacritics(
                        cell_text(row, "./td[1]/span/text()"))
                    value = cell_text(row, "./td[last()]/span/text()")
                    if title:
                        table_content[title] = value
                self.result.update({table_name: table_content})

            else:
                # The header row decides once whether a 'valor' column exists.
                has_valor_column = bool(rows) and bool(rows[0].xpath(
                    ".//span[contains(text(),'Valor')]"))
                table_content = []
                for row in rows[1:]:
                    exercicio = cell_text(row, "./td[1]/span/text()")
                    if not exercicio:
                        continue
                    rows_data = {'exercicio': exercicio}
                    if has_valor_column:
                        rows_data['valor'] = cell_text(
                            row, "./td[last()]/span/text()")
                    table_content.append(rows_data)
                self.result.update({table_name: table_content})

        multas_btn = page.xpath("//input[contains(@id,'Multas')]")
        if not multas_btn:
            self.result['multas'] = []
            return

        # Echo the ASP.NET hidden state fields back together with the button.
        hidden_fields = ('__EVENTTARGET', '__EVENTARGUMENT', '__VIEWSTATE',
                         '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
        frm_data = {
            field: page.xpath(
                "//input[@id='{}']/@value".format(field)).get("")
            for field in hidden_fields
        }
        multas_btn_name = multas_btn.xpath("./@name").get("").strip()
        multas_btn_value = multas_btn.xpath("./@value").get("").strip()
        frm_data[multas_btn_name] = multas_btn_value

        multas_url = "https://www.ipva.fazenda.sp.gov.br/IPVANET_Consulta/Pages/Aviso.aspx"
        yield SplashFormRequest(multas_url,
                                formdata=frm_data,
                                callback=self.multas_table,
                                errback=self.errback_func,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={
                                    'lua_source': script_10_sec_wait,
                                    'cookies': response.data['cookies']
                                },
                                dont_filter=True)
예제 #23
0
    def get_main_page(self, response):
        """Copy the vehicle page into ``self.result`` and queue follow-ups.

        An error banner ends processing early: "no record found" is stored in
        ``self.errors``; a captcha challenge is reported and, while retries
        remain, the start URL is requested again.  Otherwise the labelled
        vehicle fields and the debts / infractions / fines / appeals tables
        are stored, the DARE form is posted when ``self.get_files`` is set and
        the button exists, and the IPVA lookup is always requested last.
        """
        page = response.selector
        collapse_ws = re.compile(r'\s+')

        def first_text(node, path):
            # First matching text node, stripped and whitespace-collapsed.
            return collapse_ws.sub(" ", node.xpath(path).get("").strip())

        def joined_text(node, path):
            # All matching text nodes joined by spaces, then normalised.
            return collapse_ws.sub(
                " ", " ".join(node.xpath(path).extract()).strip())

        error_message = page.xpath("//div[@class='msgErro']/text()").get("")
        print(error_message)
        if "Nenhum registro encontrado, verifique os dados digitados" in error_message:
            error_msg = {"error_type": "WRONG_CREDENTIALS",
                         "details": error_message}
            self.errors.append(error_msg)
            self.logger.warning(error_msg)
            return
        if "Confirme que você não é um robô." in error_message:
            self.incorrect_captcha_report(
                self.captcha_service, self.g_recaptcha_id)
            if self.incorrect_captcha_retries > 0:
                yield Request(self.start_url, callback=self.login_me,
                              meta={'dont_merge_cookies': True}, dont_filter=True)
            return

        # (result key, label text shown in the <span> next to the value)
        labelled_fields = (
            ('placa', 'Placa'),
            ('marca_modelo', 'Marca/Modelo'),
            ('fabricacao_modelo', 'Fabricacao/Modelo'),
            ('cor', 'Cor'),
            ('renavam', 'Renavam'),
            ('tipo', 'Tipo'),
            ('carroceria', 'Carroceria'),
            ('especie', 'Especie'),
            ('lugares', 'Lugares'),
            ('categoria', 'Categoria'),
            ('potencia', 'Potência'),
            ('combustivel', 'Combustível'),
            ('nome_do_proprietario', 'Nome do Proprietário'),
            ('situacao_lacre', 'Situação Lacre'),
            ('proprietario_anterior', 'Proprietário Anterior'),
            ('origem_dos_dados_do_veiculo', 'Origem dos Dados do Veículo'),
            ('placa_anterior', 'Placa Anterior'),
            ('municipio_de_emplacamento', 'Municipio de Emplacamento'),
            ('licenciado_ate', 'Licenciado ate'),
            ('adquirido_em', 'Adquirido em'),
            ('situacao', 'Situação'),
            ('restricao_a_venda', 'Restrição a Venda'),
            ('informacoes_pendentes_originadas_das_financeiras_via_sng_sistema_nacional_de_gravame',
             'Informações PENDENTES originadas das financeiras via SNG - Sistema Nacional de Gravame'),
            ('impedimentos', 'Impedimentos'),
        )
        extracted = {}
        for result_key, label in labelled_fields:
            # The value is the text of the label span's parent element.
            field_value = first_text(
                page, "//span[text()='{}']/../text()".format(label))
            extracted[result_key] = field_value
            self.result[result_key] = field_value
        renavam = extracted['renavam']

        # Debts: one dict per row, columns taken positionally.
        debito_columns = ('descricao', 'vencimento', 'nominal_r',
                          'corrigido_r', 'desconto_r', 'juros_r', 'multa_r',
                          'atual_r')
        debitos = [
            {
                column: first_text(row, "./td[{}]/text()".format(position))
                for position, column in enumerate(debito_columns, start=1)
            }
            for row in page.xpath(
                "//div[@id='corpo_DebitosVeiculo']/div[@id='Integral']/table[@id='TabelaIntegral']/tbody/tr")
        ]
        self.result['debitos'] = debitos

        # Columns 2-5 are shared by the infractions and fines tables.
        notice_columns = ('status', 'descricao', 'local_complemento', 'valor')

        infracoes_em_autuacao = []
        for row in page.xpath("//div[@id='corpo_AutuacoesVeiculo']//tbody/tr"):
            fragments = (s.strip() for s in row.xpath("./td[1]/text()").extract())
            num_auto = " ".join(piece for piece in fragments if piece)
            if "veículo até o momento." in num_auto:
                # Placeholder row: nothing registered for this vehicle.
                break
            entry = {'num_auto': num_auto}
            for position, column in enumerate(notice_columns, start=2):
                entry[column] = joined_text(
                    row, "./td[{}]/text()".format(position))
            infracoes_em_autuacao.append(entry)
        self.result['infracoes_em_autuacao'] = infracoes_em_autuacao

        penalidades_multas = []
        for row in page.xpath("//div[@id='corpo_MultasVeiculo']//tbody/tr"):
            num_auto = joined_text(row, "./td[1]/text()")
            if "veículo até o momento." in num_auto:
                break
            entry = {'num_auto': num_auto}
            for position, column in enumerate(notice_columns, start=2):
                entry[column] = joined_text(
                    row, "./td[{}]/text()".format(position))
            penalidades_multas.append(entry)
        self.result['penalidades_multas'] = penalidades_multas

        appeal_columns = ('nº_proc_renainf', 'numero_do_auto',
                          'detalhamento_da_infracao', 'situacao_do_processo')
        recursos_infracao = []
        for row in page.xpath("//div[@id='corpo_RecursosInfracao']//tbody/tr"):
            processo = joined_text(row, "./td[1]/text()")
            if "veículo até o momento." in processo:
                break
            entry = {'processo': processo}
            for position, column in enumerate(appeal_columns, start=2):
                entry[column] = joined_text(
                    row, "./td[{}]/text()".format(position))
            recursos_infracao.append(entry)
        self.result['recursos_infracao'] = recursos_infracao

        dare_btn = page.xpath("//input[@id='BotaoIntegral']")
        if self.get_files and dare_btn:
            dare_url = "https://consulta.detran.ro.gov.br/CentralDeConsultasInternet/Internet/DARE.asp"
            frm_data = {
                'hdListaIdDebitos': page.xpath(
                    "//input[@name='hdListaIdDebitos']/@value").get(""),
                'hdPlaca': page.xpath(
                    "//input[@name='hdPlaca']/@value").get(""),
            }
            print(frm_data)
            yield SplashFormRequest(dare_url, formdata=frm_data,
                                    endpoint='render.json', args=self.splash_args,
                                    meta={'result_key': 'debitos'},
                                    callback=self.print_html_to_pdf, dont_filter=True)

        ipva_url = "https://portalcontribuinte.sefin.ro.gov.br/Publico/ConsultaRenavam.jsp?renavam={}".format(renavam)
        yield Request(ipva_url, callback=self.get_ipva_search, meta={'renavam': renavam}, dont_filter=True)
    def get_main_page(self, response):
        """Scrape the vehicle tables and queue the fines / print requests.

        A visible error alert is recorded in ``self.errors`` as
        ``WRONG_CREDENTIALS`` and ends processing.  Otherwise every
        ``table.tabela`` is stored in ``self.result`` under its normalised
        title: the "laudo_de_vistoria" section as a list of inspection rows
        (or a fixed "sem_laudo_de_vistoria" message when there are none), the
        "licenciamento_digital" section is skipped, and every other section as
        a ``{label: value}`` dict.  Finally the page form is re-posted to open
        the fines details and, when ``self.get_files`` is set, to trigger the
        print action.

        Args:
            response: response object exposing ``selector`` (queried with
                XPath) and ``data['cookies']`` (forwarded to both follow-up
                requests).

        Yields:
            ``SplashFormRequest`` objects for the fines details and for the
            print/download action.
        """
        page = response.selector
        error_message = page.xpath(
            "//ul[contains(@class,'alert-error') and not(@style)]/li/span/text()"
        ).get("")
        if error_message:
            error_msg = {
                "error_type": "WRONG_CREDENTIALS",
                "details": error_message
            }
            self.errors.append(error_msg)
            self.logger.error(error_msg)
            return

        # Compiled once; the original recompiled it inside the table loop.
        regex = re.compile(r'\s+')

        def joined_text(node, path):
            # All matching text nodes joined by spaces, whitespace-collapsed.
            return regex.sub(
                " ", " ".join(node.xpath(path).extract()).strip())

        # Result keys for the inspection table, by column position.
        laudo_columns = ('data_da_vistoria', 'empresa_responsavel', 'km',
                         'resultado_da_vistoria', 'motivo_resultado',
                         'situacao')

        for table in page.xpath("//table[@class='tabela']"):
            # NOTE(review): remove_diacritics presumably also slugifies the
            # title (it is matched against snake_case names) - confirm.
            title = self.remove_diacritics(
                regex.sub(
                    " ",
                    table.xpath(
                        ".//tbody/tr/td[not(./span[@id])]/strong/text()").get(
                            "")))
            if "licenciamento_digital" in title:
                continue

            if "laudo_de_vistoria" in title:
                # The inspection rows live in a separate table that is looked
                # up on the whole page, not inside the current table.
                table_content = [
                    {
                        column: joined_text(
                            row, "./td[{}]/span//text()".format(position))
                        for position, column in enumerate(laudo_columns,
                                                          start=1)
                    }
                    for row in page.xpath(
                        "//table[@class='tableResultadoLaudo']/tbody/tr")
                ]
                if table_content:
                    self.result[title] = table_content
                else:
                    self.result[
                        "sem_laudo_de_vistoria"] = "Não existem vistorias eletrônicas realizadas no estado de São Paulo para o veiculo."
                continue

            table_content = {}
            for cell in table.xpath(".//tbody/tr[.//span[@id]]/td"):
                key = self.remove_diacritics(
                    regex.sub(" ",
                              cell.xpath("./strong/text()").get("").strip()))
                table_content[key] = joined_text(
                    cell, "./span//text() | ./text() | ./a/text()")
            if table_content:
                self.result[title] = table_content

        # get formdata
        multas_form = page.xpath("//div[@class='container']/form")
        form_name = multas_form.xpath("./@name").get("")
        frm_data = {}
        for inpt in multas_form.xpath(".//input"):
            inpt_name = inpt.xpath("./@name").get("")
            if not inpt_name:
                # A control without a name is never part of a browser's form
                # submission; posting it under the key "" would send a field
                # the real page never sends.
                continue
            frm_data[inpt_name] = inpt.xpath("./@value").get("")

        # get_detalhes_das_multas
        btn_id = page.xpath(
            "//table[@class='tabela' and .//strong[contains(text(),'Multas')]]//a/@id"
        ).get("")
        if btn_id:
            frm_data_copy = frm_data.copy()
            frm_data_copy.update({"{}:_idcl".format(form_name): btn_id})
            url = "http://www.detran.sp.gov.br" + multas_form.xpath(
                "./@action").get("")
            yield SplashFormRequest(url,
                                    formdata=frm_data_copy,
                                    callback=self.get_detalhes_das_multas,
                                    errback=self.errback_func,
                                    endpoint='execute',
                                    cache_args=['lua_source'],
                                    args={
                                        'lua_source': script_10_sec_wait,
                                        'cookies': response.data['cookies']
                                    },
                                    dont_filter=True)

        # get file
        if self.get_files:
            btn_id = page.xpath("//a[@title='Imprimir']/@id").get("")
            if btn_id:
                frm_data_copy = frm_data.copy()
                frm_data_copy.update({
                    form_name: form_name,
                    'javax.faces.behavior.event': 'click',
                    'javax.faces.partial.event': 'click',
                    'javax.faces.source': btn_id,
                    'javax.faces.partial.ajax': 'true',
                    'javax.faces.partial.execute': btn_id
                })
                encoded_url = frm_data_copy.get('javax.faces.encodedURL')
                if encoded_url is None:
                    # Without the hidden field there is no target URL; skip
                    # the download instead of failing with KeyError.
                    self.logger.warning(
                        "javax.faces.encodedURL input not found; "
                        "skipping file download")
                else:
                    url = "http://www.detran.sp.gov.br" + encoded_url
                    yield SplashFormRequest(url,
                                            formdata=frm_data_copy,
                                            callback=self.downoad_request,
                                            errback=self.errback_func,
                                            endpoint='execute',
                                            cache_args=['lua_source'],
                                            args={
                                                'lua_source': script_10_sec_wait,
                                                'cookies': response.data['cookies']
                                            },
                                            dont_filter=True)
예제 #25
0
    def parse(self, response):
        """Extract root-level fields from ``response`` and schedule pagination.

        Every entry of ``self.selectors`` whose ``parentSelectors`` is
        ``"_root"`` is handled according to its ``type``:

        * ``SelectorText`` / ``SelectorElementAttribute`` / ``SelectorHTML``:
          resolved, optionally filtered through the selector's ``regex``, and
          stored in the result dict under the selector's ``name``.
        * ``SelectorImage``: resolved and stored the same way, without the
          regex filter.
        * ``SelectorLink``: supplies the pagination target, but only when the
          response meta carries no ``'first'`` key.

        If a root ``SelectorElement`` exists, the work is delegated to
        ``self.element_parse``; otherwise, if a root ``SelectorDetail`` exists
        and has both detail URLs and child selectors, to ``self.detail_parse``.
        In every other case the result dict itself is yielded.

        Pagination comes last. A tuple link is ``(pagerange, regex)``: the
        range expression is evaluated and, for each page number, the part of
        ``response.url`` matching ``regex`` is replaced by that number. A
        string link is requested as-is. The request class depends on
        ``self.dynamic`` (Splash or plain Scrapy) and ``self.method``
        (POST form or GET).

        Args:
            response: The downloaded response handed over by Scrapy.

        Yields:
            Whatever the delegated parsers yield (or the result dict),
            followed by the pagination requests.
        """
        # Only genuine Response objects are processed.
        if not isinstance(response, Response):
            self.logger.info("non-HTML response is skipped: %s", response.url)
            return

        def collapse(selector, values, use_regex=True):
            """Shape resolver output into the value stored for ``selector``."""
            multiple = selector["multiple"]
            # A single-valued selector keeps only its first hit.
            if values and not multiple:
                values = [values[0]]
            # Regex filter: first match of each value, "" when nothing matches.
            # A resolver that returned None is passed through untouched
            # instead of raising TypeError while iterating it.
            if (use_regex and values is not None and 'regex' in selector
                    and selector["regex"]):
                pattern = selector["regex"]
                # The pattern is evaluated once per value.
                hits = (re.findall(pattern, value) for value in values)
                values = [found[0] if found else "" for found in hits]
            # Multi-valued selectors (and empty results) are stored as-is,
            # single-valued ones are unwrapped.
            if multiple or not values:
                return values
            return values[0]

        # Selectors attached directly to the page root.
        root_selectors = [
            s for s in self.selectors if s["parentSelectors"] == "_root"
        ]
        # Collected field values.
        result = {}
        # Pagination target: '' (none), a URL string, or (pagerange, regex).
        link = ''
        for selector in root_selectors:
            kind = selector["type"]
            if kind == "SelectorText":
                values = self.text_resolve(selector, response)
            elif kind == "SelectorImage":
                values = self.image_resolve(selector, response, response.url)
            elif kind == "SelectorElementAttribute":
                values = self.attribute_resolve(selector, response)
            elif kind == "SelectorHTML":
                values = self.html_resolve(selector, response)
            elif kind == "SelectorLink":
                # Responses to the generated page-range requests carry a
                # 'first' meta key and must not resolve the link again.
                if 'first' not in response.meta:
                    link = self.link_resolve(selector, response)
                continue
            else:
                # Element/detail selectors are handled after this loop.
                continue
            result[selector['name']] = collapse(
                selector, values, use_regex=kind != "SelectorImage")

        element_selectors = [
            s for s in root_selectors if s["type"] == "SelectorElement"
        ]
        detail_selectors = [
            s for s in root_selectors if s["type"] == "SelectorDetail"
        ]
        if element_selectors:
            yield from self.element_parse(element_selectors[0], response,
                                          result, response.url)

        elif detail_selectors:
            detail_selector = detail_selectors[0]
            detail_urls = self.detail_resolve(detail_selector, response,
                                              response.url)
            child_selectors = [
                s for s in self.selectors
                if s["parentSelectors"] == detail_selector['name']
            ]
            if detail_urls and child_selectors:
                yield from self.detail_parse(detail_selector, detail_urls,
                                             child_selectors, result)
            else:
                yield result
        else:
            yield result

        # Pagination.
        # NOTE(review): these diagnostics go to stdout, not self.logger.
        print('--------------------------')
        print('next page', link)
        if not link:
            return

        if isinstance(link, tuple):
            pagerange, regex = link[0], link[1]
            try:
                # SECURITY: eval() executes arbitrary Python. This is only
                # acceptable while ``pagerange`` comes from trusted spider
                # configuration; for plain literals such as "10" or "1, 10"
                # ast.literal_eval would be the safe replacement.
                args = eval(pagerange)
            except Exception:
                # Best effort: an unusable range yields no extra pages, but
                # the failure is logged.
                self.logger.warning("cannot evaluate page range %r",
                                    pagerange,
                                    exc_info=True)
                args = 0
            # A bare stop value N means pages 0..N-1.
            if not isinstance(args, tuple):
                args = (0, args)
            is_post = self.method.lower() == 'post'
            for page in range(*args):
                # Substitute the page number into the current URL.
                page_url = re.sub(regex, str(page), response.url)
                if not self.dynamic:
                    if is_post:
                        request = FormRequest(url=page_url,
                                              formdata=self.form,
                                              headers=self.headers,
                                              cookies=self.cookies,
                                              meta={'first': False},
                                              callback=self.parse)
                    else:
                        # No explicit callback, as in the original: Scrapy
                        # falls back to the spider's default callback.
                        request = Request(url=page_url,
                                          headers=self.headers,
                                          cookies=self.cookies,
                                          meta={'first': False})
                else:
                    if is_post:
                        request = SplashFormRequest(url=page_url,
                                                    formdata=self.form,
                                                    meta={'first': False},
                                                    args=self.args_data,
                                                    callback=self.parse)
                    else:
                        request = SplashRequest(url=page_url,
                                                meta={'first': False},
                                                callback=self.parse,
                                                endpoint='execute',
                                                args=self.args_data)
                yield request
        elif isinstance(link, str):
            print('下一页', link)
            is_post = self.method.lower() == 'post'
            if not self.dynamic:
                if is_post:
                    request = FormRequest(url=link,
                                          formdata=self.form,
                                          headers=self.headers,
                                          cookies=self.cookies,
                                          callback=self.parse,
                                          dont_filter=True)
                else:
                    request = Request(url=link,
                                      headers=self.headers,
                                      cookies=self.cookies,
                                      callback=self.parse)
            else:
                if is_post:
                    request = SplashFormRequest(url=link,
                                                formdata=self.form,
                                                callback=self.parse,
                                                dont_filter=True,
                                                args=self.args_data)
                else:
                    request = SplashRequest(url=link,
                                            endpoint='execute',
                                            callback=self.parse,
                                            args=self.args_data)
            yield request
예제 #26
0
    def parse(self, response):
        """Yield a ``ZlItem`` per patent record, each followed by a form request.

        Every field is located with its own XPath query and the per-field
        results are zipped together, so a record is produced only for
        positions at which all twelve queries returned a node. After each
        item, a ``SplashFormRequest`` to ``pam.action`` is yielded for that
        record's publication number, with ``self.downparse`` as its callback.
        """
        def text_of(node):
            # String value of the node, as returned by XPath string(.).
            return node.xpath("string(.)").extract_first()

        site_root = 'http://epub.sipo.gov.cn/'
        request_headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }

        columns = (
            # patent title
            response.xpath("//h1"),
            # application publication number
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[1]'),
            # application publication date
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[2]'),
            # application number
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[3]'),
            # filing date
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[4]'),
            # applicant
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[5]'),
            # inventor
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[6]'),
            # address
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[8]'),
            # classification number
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[9]'),
            # abstract
            response.xpath(
                '//div[@class="cp_box"]/div/div[@class="cp_jsh"]'),
            # QR code image source
            response.xpath('//div[@class="cp_box"]/a/img/@src'),
            # thumbnail image source
            response.xpath(
                '//div[@class="cp_box"]/div[@class="cp_img"]/img/@src'),
        )

        for (heading, pub_no_li, pub_date_li, app_no_li, app_date_li,
             applicant_li, inventor_li, address_li, class_li, abstract_div,
             qr_src, thumb_src) in zip(*columns):
            record = ZlItem()
            # NOTE(review): the fixed slice offsets below presumably drop each
            # field's label prefix -- confirm against the live page markup.
            record['title'] = text_of(heading).strip().split()[1]
            # Reused below for the follow-up request's query.
            pub_no = text_of(pub_no_li)[6:]
            record['openNo'] = pub_no
            record['openDate'] = text_of(pub_date_li)[6:]
            record['applyNo'] = text_of(app_no_li)[5:]
            record['applyDate'] = text_of(app_date_li)[4:]
            record['applyPeople'] = text_of(applicant_li)[4:]
            # Inventor names with all whitespace removed.
            inventor_text = text_of(inventor_li)[4:].strip()
            record['inventor'] = ''.join(inventor_text.split())
            record['address'] = text_of(address_li)[3:]
            # First whitespace-separated token, minus its leading 4 characters.
            record['classifyNo'] = text_of(class_li).split()[0][4:]
            # Second whitespace-separated token of the abstract block.
            record['summery'] = text_of(abstract_div).split()[1]
            record['qrcodeurls'] = site_root + qr_src.extract()
            record['thumb'] = site_root + thumb_src.extract()

            yield record

            form_fields = {
                "strWhere": "PN='" + pub_no + "'",
                "strSources": "pip",
            }
            yield SplashFormRequest(
                url='http://epub.sipo.gov.cn/pam.action',
                callback=self.downparse,
                method='POST',
                args={'wait': 5},
                formdata=form_fields,
                headers=request_headers)