def parse(self, response):
    """Yield one bio-page request per senator row of the listing table.

    Every row after the first of the table with bordercolor "#ece8e1" is
    read: the "surname, name" text and the onclick target of its second
    cell are extracted, a LegisladorItemLoader is pre-filled with them and
    handed to ``self._parse_senador_bio_page`` through the request
    callback.

    Rows lacking a "surname, name" text, an onclick link or an ``id_sena``
    query parameter are logged and skipped, so that one malformed row does
    not abort the generator and lose every remaining senator.
    """
    # Function-scope import: the file's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//table[@bordercolor="#ece8e1"]/tbody/tr[position()>1]')
    for row in rows:
        name_parts = row.select('td[2]/text()').re(r'(.*?),(.*)')
        targets = row.select('td[2]/@onclick').re(r"^location\.href = '(.*?)'")
        if len(name_parts) < 2 or not targets:
            logger.warning("Skipping senator row without name or link in %s",
                           response.url)
            continue

        resource_url = urljoin(response.url, targets[0])
        sena_ids = parse_qs(urlparse(resource_url).query).get('id_sena')
        if not sena_ids:
            logger.warning("Skipping senator row without id_sena: %s",
                           resource_url)
            continue
        resource_id = sena_ids[0]
        # Only the first (surname, name) pair is used if several text
        # nodes happen to match.
        apellido, nombre = name_parts[:2]

        loader = LegisladorItemLoader(item=LegisladorItem(), selector=row)
        loader.add_value('apellido', apellido)
        loader.add_value('nombre', nombre)
        # NOTE(review): these two XPaths are evaluated relative to the row
        # element itself, not to one of its cells -- confirm they match.
        loader.add_xpath('distrito_nombre', 'text()[1]')
        loader.add_xpath('partido_nombre', 'text()[2]')
        loader.add_value('camara', 'S')
        loader.add_value(
            'resource_source',
            u"http://www.senado.gov.ar/web/senadores/senadores.php")
        loader.add_value('resource_url', resource_url)
        loader.add_value('resource_id', resource_id)
        loader.add_value('id', 'senador:%s' % resource_id)
        # The callback receives the loader under the keyword ``l``.
        yield Request(
            resource_url,
            callback=partial(self._parse_senador_bio_page, l=loader))
def parse_diputado(self, response, item_data):
    """Build and yield the item for one deputy profile page (table layout).

    Args:
        response: the downloaded profile page.
        item_data: mapping that provides the 'resource_url', 'resource_id',
            'nombre' and 'apellido' values copied into the item.

    Yields:
        The item produced by the loader's ``load_item()``.
    """
    hxs = HtmlXPathSelector(response)
    loader = LegisladorItemLoader(
        selector=hxs.select("/html/body/div/div[3]/table/tbody"))

    # The profile URL doubles as the item id (unique enough).
    loader.add_value('id', item_data['resource_url'])
    loader.add_value('resource_source', u'webappl.hcdn.gov.ar')
    loader.add_value('resource_id', item_data['resource_id'])
    loader.add_value('resource_url', item_data['resource_url'])
    loader.add_value('camara', 'D')
    loader.add_value('nombre', item_data['nombre'])
    loader.add_value('apellido', item_data['apellido'])

    loader.add_xpath('foto_url', 'tr[1]/td[2]/img/@src')
    loader.add_xpath('bloque_nombre', 'tr[3]/td[2]/text()')
    # Unicode-aware \w instead of [a-zA-Z0-9] so that district names with
    # accented letters are captured whole rather than cut at the accent.
    loader.add_xpath('distrito_nombre', 'tr[4]/td[1]/text()',
                     re=r'(?u)Distrito:\xa0 ([\w ]+)')
    # NOTE(review): this pattern is unanchored, so it matches every date in
    # the cell; which match is kept depends on the loader's processors.
    loader.add_xpath('mandato_inicio', 'tr[4]/td[2]/text()',
                     re=r'(\d\d/\d\d/\d\d\d\d)')
    loader.add_xpath('mandato_fin', 'tr[4]/td[2]/text()',
                     re=r'(\d\d/\d\d/\d\d\d\d)$')
    loader.add_xpath('email', 'tr[5]/td[2]/a/text()')
    loader.add_xpath('telefono', 'tr[5]/td[1]/text()', re=r'([-\d]+)')

    yield loader.load_item()
def parse_diputado(self, response, item_data):
    """Build and yield the item for one deputy profile page (div layout).

    Args:
        response: the downloaded profile page.
        item_data: mapping that provides the 'resource_url', 'resource_id',
            'nombre' and 'apellido' values copied into the item.

    Yields:
        The item produced by the loader's ``load_item()``.
    """
    hxs = HtmlXPathSelector(response)
    loader = LegisladorItemLoader(
        selector=hxs.select('//div[@class="diputados-principal"]'))

    # The two info boxes inside the main div hold most of the fields.
    info1 = './/div[@class="info-diputados-principal1"]'
    info2 = './/div[@class="info-diputados-principal2"]'

    # The profile URL doubles as the item id (unique enough).
    loader.add_value('id', item_data['resource_url'])
    loader.add_value('resource_source', u'www.hcdn.gov.ar')
    loader.add_value('resource_id', item_data['resource_id'])
    loader.add_value('resource_url', item_data['resource_url'])
    loader.add_value('camara', 'D')
    loader.add_value('nombre', item_data['nombre'])
    loader.add_value('apellido', item_data['apellido'])

    loader.add_xpath('foto_url',
                     './/div[@class="foto-diputados-principal"]/img/@src')
    # Unicode-aware \w instead of [a-zA-Z0-9] so that district names with
    # accented letters are captured whole rather than cut at the accent.
    loader.add_xpath('distrito_nombre', info1 + '//text()[2]',
                     re=r'(?u)Distrito:\xa0 ([\w ]+)')
    loader.add_xpath('telefono', info1 + '//text()[3]', re=r'([-\d]+)')
    loader.add_xpath('bloque_nombre', info2 + '//h3/text()')
    # Both dates come from the same text node: the start date is the one
    # followed by "\xa0-", the end date the one preceded by "-\xa0".
    loader.add_xpath('mandato_inicio', info2 + '//text()[2]',
                     re=r'.*(\d\d/\d\d/\d\d\d\d)\xa0-.*')
    loader.add_xpath('mandato_fin', info2 + '//text()[2]',
                     re=r'.*-\xa0(\d\d/\d\d/\d\d\d\d).*')
    loader.add_xpath('email', info2 + '//a/text()')

    yield loader.load_item()
def parse(self, response):
    """Yield one bio-page request per senator row of the listing table.

    Every row after the first of the table with bordercolor "#ece8e1" is
    read: the "surname, name" text and the onclick target of its second
    cell are extracted, a LegisladorItemLoader is pre-filled with them and
    handed to ``self._parse_senador_bio_page`` through the request
    callback.

    Rows lacking a "surname, name" text, an onclick link or an ``id_sena``
    query parameter are logged and skipped, so that one malformed row does
    not abort the generator and lose every remaining senator.
    """
    # Function-scope import: the file's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)
    hxs = HtmlXPathSelector(response)
    senadores = hxs.select(
        '//table[@bordercolor="#ece8e1"]/tbody/tr[position()>1]')
    for sen in senadores:
        nombres = sen.select('td[2]/text()').re(r'(.*?),(.*)')
        enlaces = sen.select('td[2]/@onclick').re(
            r"^location\.href = '(.*?)'")
        if len(nombres) < 2 or not enlaces:
            logger.warning("Skipping senator row without name or link in %s",
                           response.url)
            continue

        resource_url = urljoin(response.url, enlaces[0])
        query = parse_qs(urlparse(resource_url).query)
        if not query.get('id_sena'):
            logger.warning("Skipping senator row without id_sena: %s",
                           resource_url)
            continue
        resource_id = query['id_sena'][0]
        # Only the first (surname, name) pair is used if several text
        # nodes happen to match.
        apellido, nombre = nombres[:2]

        loader = LegisladorItemLoader(item=LegisladorItem(), selector=sen)
        loader.add_value('apellido', apellido)
        loader.add_value('nombre', nombre)
        # NOTE(review): these two XPaths are evaluated relative to the row
        # element itself, not to one of its cells -- confirm they match.
        loader.add_xpath('distrito_nombre', 'text()[1]')
        loader.add_xpath('partido_nombre', 'text()[2]')
        loader.add_value('camara', 'S')
        loader.add_value(
            'resource_source',
            u"http://www.senado.gov.ar/web/senadores/senadores.php")
        loader.add_value('resource_url', resource_url)
        loader.add_value('resource_id', resource_id)
        loader.add_value('id', 'senador:%s' % resource_id)
        # The callback receives the loader under the keyword ``l``.
        yield Request(
            resource_url,
            callback=partial(self._parse_senador_bio_page, l=loader))
def parse_diputado(self, response, item_data):
    """Build and yield the item for one deputy profile page (div layout).

    Args:
        response: the downloaded profile page.
        item_data: mapping that provides the 'resource_url', 'resource_id',
            'nombre' and 'apellido' values copied into the item.

    Yields:
        The item produced by the loader's ``load_item()``.
    """
    hxs = HtmlXPathSelector(response)
    loader = LegisladorItemLoader(
        selector=hxs.select('//div[@class="diputados-principal"]'))

    # Identity fields are taken verbatim from the caller-supplied data; the
    # profile URL doubles as the item id (unique enough).
    loader.add_value('id', item_data['resource_url'])
    loader.add_value('resource_source', u'www.hcdn.gov.ar')
    loader.add_value('resource_id', item_data['resource_id'])
    loader.add_value('resource_url', item_data['resource_url'])
    loader.add_value('camara', 'D')
    loader.add_value('nombre', item_data['nombre'])
    loader.add_value('apellido', item_data['apellido'])

    loader.add_xpath(
        'foto_url',
        './/div[@class="foto-diputados-principal"]/img/@src')
    # Unicode-aware \w instead of [a-zA-Z0-9] so that district names with
    # accented letters are captured whole rather than cut at the accent.
    loader.add_xpath(
        'distrito_nombre',
        './/div[@class="info-diputados-principal1"]//text()[2]',
        re=r'(?u)Distrito:\xa0 ([\w ]+)')
    loader.add_xpath(
        'telefono',
        './/div[@class="info-diputados-principal1"]//text()[3]',
        re=r'([-\d]+)')
    loader.add_xpath(
        'bloque_nombre',
        './/div[@class="info-diputados-principal2"]//h3/text()')
    # Both dates come from the same text node: the start date is the one
    # followed by "\xa0-", the end date the one preceded by "-\xa0".
    loader.add_xpath(
        'mandato_inicio',
        './/div[@class="info-diputados-principal2"]//text()[2]',
        re=r'.*(\d\d/\d\d/\d\d\d\d)\xa0-.*')
    loader.add_xpath(
        'mandato_fin',
        './/div[@class="info-diputados-principal2"]//text()[2]',
        re=r'.*-\xa0(\d\d/\d\d/\d\d\d\d).*')
    loader.add_xpath(
        'email',
        './/div[@class="info-diputados-principal2"]//a/text()')

    yield loader.load_item()