class ReceitaFederalSpider(scrapy.Spider): name = "ReceitaFederal" url = "http://www.receita.fazenda.gov.br/pessoajuridica/cnpj/cnpjreva/valida.asp" start_urls = ["http://www.receita.fazenda.gov.br/pessoajuridica/cnpj/cnpjreva/valida.asp"] def __init__(self, cnpj="21101794000150"): self.gs = GiantSpider() self.driver = webdriver.Firefox() self.cnpj = cnpj def parse(self, response): self.driver.get(self.url) self.fillForm() self.html = self.driver.page_source receita = self.scraping() qsaButton = self.driver.find_element_by_name("qsa") qsaButton.click() self.html = self.driver.page_source self.driver.close() self.driver.quit() receita = self.scrapingQSA(receita) self.gs.saveItem(receita, self.name) self.gs.updateFile(self.cnpj); def fillForm(self): captcha = self.gs.decodeCaptchaBypass(self.cnpj, self.driver, self.name, (182,150,363,199)) cnpjInput = self.driver.find_element_by_xpath("//*[@id='cnpj']") cnpjInput.send_keys(self.cnpj) captchaInput = self.driver.find_element_by_xpath("//*[@id='txtTexto_captcha_serpro_gov_br']") captchaInput.send_keys(captcha) continuarInput = self.driver.find_element_by_xpath("//*[@id='submit1']") continuarInput.click() error = self.driver.find_elements_by_xpath("//*[@id='theForm']/font/font/table/tbody/tr[2]/td/font/b"); if len(error) > 0: self.fillForm() def scraping(self): receita = ReceitaItem() receita['endereco'] = {} receita['contato'] = {} receita['cadastral'] = {} pre_xpath = "/html/body/table[2]/tbody/tr/td/" cnpjValido = Selector(text=self.html).xpath(pre_xpath + 'table[2]/tbody/tr/td[1]/font[2]/b[1]/text()'); if cnpjValido: cnpj = Selector(text=self.html).xpath(pre_xpath + 'table[2]/tbody/tr/td[1]/font[2]/b[1]/text()').extract()[0].strip(' \r\n\t') receita['cnpj'] = re.sub('[./-]', '', cnpj) receita['data_constituicao'] = Selector(text=self.html).xpath(pre_xpath + '/table[2]/tbody/tr/td[3]/font/b/text()').extract()[0].strip(' \r\n\t') receita['razao_social'] = Selector(text=self.html).xpath(pre_xpath + 'table[3]/tbody/tr/td/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita['nome_fantasia'] = Selector(text=self.html).xpath(pre_xpath + 'table[4]/tbody/tr/td/font[2]/b/text()').extract()[0].strip(' \r\n\t') atividade_primaria = Selector(text=self.html).xpath(pre_xpath + 'table[5]/tbody/tr/td/font[2]/b').extract() #receita['atividade_economica_primaria'] = Selector(text=self.html).xpath(pre_xpath + 'table[2]/tbody/tr/td[1]/font[2]/b[1]/text()').extract()[0] #receita['atividade_economica_secundaria'] = Selector(text=self.html).xpath(pre_xpath + 'table[2]/tbody/tr/td[1]/font[2]/b[1]/text()').extract()[0] receita['natureza_juridica'] = Selector(text=self.html).xpath(pre_xpath + 'table[7]/tbody/tr/td/font[2]/b/text()').extract()[0].strip(' \r\n\t') #Preencher o Endereco receita['endereco']['logradouro'] = Selector(text=self.html).xpath(pre_xpath + 'table[8]/tbody/tr/td[1]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita["endereco"]["numero"] = Selector(text=self.html).xpath(pre_xpath + 'table[8]/tbody/tr/td[3]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita["endereco"]["complemento"] = Selector(text=self.html).xpath(pre_xpath + 'table[8]/tbody/tr/td[5]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita["endereco"]["bairro"] = Selector(text=self.html).xpath(pre_xpath + 'table[9]/tbody/tr/td[3]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita["endereco"]["cidade"] = Selector(text=self.html).xpath(pre_xpath + 'table[9]/tbody/tr/td[5]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita["endereco"]["uf"] = Selector(text=self.html).xpath(pre_xpath + 'table[9]/tbody/tr/td[7]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita["endereco"]["cep"] = Selector(text=self.html).xpath(pre_xpath + 'table[9]/tbody/tr/td[1]/font[2]/b/text()').extract()[0].strip(' \r\n\t') #Preencher o Contato receita['contato']['email'] = Selector(text=self.html).xpath(pre_xpath + 'table[10]/tbody/tr/td[1]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita['contato']['telefone'] = Selector(text=self.html).xpath(pre_xpath + 'table[10]/tbody/tr/td[1]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita['contato']['ente_federativo_responsavel'] = Selector(text=self.html).xpath(pre_xpath + 'table[11]/tbody/tr/td/font[2]/b/text()').extract()[0].strip(' \r\n\t') #Preencher os dados Cadastrais receita['cadastral']['situacao'] = Selector(text=self.html).xpath(pre_xpath + 'table[12]/tbody/tr/td[1]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita['cadastral']['data'] = Selector(text=self.html).xpath(pre_xpath + 'table[12]/tbody/tr/td[3]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita['cadastral']['motivo'] = Selector(text=self.html).xpath(pre_xpath + 'table[13]/tbody/tr/td/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita['cadastral']['situacao_especial'] = Selector(text=self.html).xpath(pre_xpath + 'table[14]/tbody/tr/td[1]/font[2]/b/text()').extract()[0].strip(' \r\n\t') receita['cadastral']['data_especial'] = Selector(text=self.html).xpath(pre_xpath + 'table[14]/tbody/tr/td[3]/font[2]/b/text()').extract()[0].strip(' \r\n\t') #Preencher evidencia do cartao CNPJ receita["html_cartao_cnpj"] = self.html return receita def scrapingQSA(self, receita): #Preencher QSA receita['qsa'] = {} capital_social = Selector(text=self.html).xpath('/html/body/table[2]/tbody/tr/td/table/tbody/tr[3]/td[2]/text()') if capital_social: receita['qsa']['capital_social'] = capital_social.extract()[0].strip(' \r\n\t') else: receita['qsa']['capital_social'] = "NAO PREENCHIDO" qsa = Selector(text=self.html).xpath('/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr') if qsa: receita['qsa']['quadro_social'] = [] quadros = Selector(text=self.html).xpath('/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr') for k in range(1, (len(quadros))): tmpQuadro = {} nome_empresarial = Selector(text=self.html).xpath('/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr['+str(k)+']/td/fieldset/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]/text()').extract() qualificacao = Selector(text=self.html).xpath('/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr['+str(k)+']/td/fieldset/table/tbody/tr/td[1]/table/tbody/tr[2]/td[2]/text()').extract() if len(nome_empresarial) > 0: tmpQuadro["nome_empresarial"] = nome_empresarial[0].strip(' \r\n\t') if len(qualificacao) > 0: tmpQuadro["qualificacao"] = qualificacao[0].strip(' \r\n\t') receita['qsa']['quadro_social'].append(tmpQuadro) else: receita['qsa']['quadro_social'] = "A NATUREZA JURIDICA NAO PERMITE O PREENCHIMENTO DO QSA"; return receita