def parse_depute_page_leg13(self, response):
    """Build a ``DeputyItem`` from a deputy page (legislature-13 layout).

    The identifier is the last path segment of ``response.url`` up to its
    first dot. Name, picture, jurisdiction and party are read from the page
    through CSS selectors; the picture ``src`` is resolved against the page
    URL.

    Yields:
        A single ``DeputyItem``.

    Raises:
        IndexError: if one of the selected elements is missing from the page.
    """
    lxs = LxmlSelector(response)
    page_url = response.url

    # e.g. ".../some/dir/1234.asp" -> "1234"
    deputy_id = urlsplit(page_url)[2].split("/")[-1].split(".")[0]

    headline = lxs.css(".deputy-headline-title").text().extract()[0]
    name = clean_name(headline)

    picture_src = lxs.css(".deputy-profile-picture")[0].attrib("src").extract()
    image = urljoin(page_url, picture_src)

    # NOTE(review): "healine" looks like a typo of "headline"; it is kept
    # as-is because it may mirror the class name used by the site markup.
    # Confirm against a live page before changing it.
    jurisdiction = lxs.css(".deputy-healine-sub-title").text().extract()[0]
    party = lxs.css(".political-party").text().extract()[0]

    yield DeputyItem(
        uuid=deputy_id,
        name=name,
        image=image,
        url=page_url,
        jurisdiction=jurisdiction,
        party=party,
    )
def parse_vote_page(self, response):
    """Parse a vote page and complete the item carried in ``response.meta``.

    Sets ``item["date"]`` (ISO-formatted string) and ``item["votes"]``.
    When the item has a truthy ``file_href``, a follow-up ``Request`` to
    that URL is yielded with ``parse_info_page`` as callback and the item
    in its meta; otherwise the item itself is yielded.

    Raises:
        KeyError: if ``response.meta`` has no ``"item"`` entry.
        ValueError: if no date can be found in the page text, or if the
            text that was found cannot be parsed as a date.
    """
    lxs = LxmlSelector(response)
    item = response.meta["item"]

    # Remove inline formatting elements (their text content is kept) before
    # running the text queries below.
    etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")

    # NOTE(review): the result is never used in this method. The call is
    # kept in case meta_as_dict has side effects -- confirm and remove.
    meta = self.meta_as_dict(lxs)

    # First attempt: a numeric date such as "du 12/03/2008".
    numeric_dates = lxs.xpath("//text()").re(r"[DUdu\s:]+(\d+/\d+/\d+)")
    if numeric_dates:
        item["date"] = datetime.strptime(numeric_dates[0], "%d/%m/%Y").isoformat()
    else:
        # Fallback: a spelled-out date such as "du 1er mars 2008".
        page_text = "".join(lxs.xpath("//text()").extract())
        # Non-breaking spaces would not be matched by the literal spaces
        # used when the date is re-assembled below.
        page_text = page_text.replace(u"\u00A0", " ")
        page_text = page_text.encode("utf-8")
        date_match = re.search(r"du[:\s]+(\d+)[er]*\s+(.+?)\s+(\d+)", page_text)
        if date_match is None:
            # The original code used a bare ``raise`` here. With no active
            # exception that fails with an unrelated error and hides the
            # actual problem.
            raise ValueError(
                "Unable to find the vote date in page %s" % response.url
            )
        spelled_date = " ".join(date_match.groups())
        # %B is the locale's full month name; presumably the process locale
        # is set to French elsewhere -- confirm.
        item["date"] = datetime.strptime(spelled_date, "%d %B %Y").isoformat()

    if lxs.css("#analyse p.nomgroupe"):
        item["votes"] = self.parse_vote_first_layout(lxs, response)
    else:
        # 2nd layout!
        item["votes"] = self.parse_vote_second_layout(lxs)

    if item.get("file_href"):
        yield Request(
            url=item["file_href"],
            callback=self.parse_info_page,
            meta={
                "item": item,
            },
        )
    else:
        yield item
def parse_deputes_page(self, response):
    """Schedule one request per deputy link found on a listing page.

    Every element matching the ``.dep2`` CSS selector has its ``href``
    resolved against ``response.url``. The callback is chosen from
    ``response.meta["leg"]``, which must be 11, 12 or 13.

    Yields:
        One ``Request`` per matching element.

    Raises:
        KeyError: when at least one link is found and ``response.meta``
            has no ``"leg"`` entry or its value is not 11, 12 or 13.
    """
    lxs = LxmlSelector(response)

    callbacks_by_leg = {}
    callbacks_by_leg[11] = self.parse_depute_page_leg11
    callbacks_by_leg[12] = self.parse_depute_page_leg12
    callbacks_by_leg[13] = self.parse_depute_page_leg13

    for link in lxs.css(".dep2"):
        target_url = urljoin(response.url, link.attrib("href").extract())
        # Looked up per link (not hoisted) so a page without any link never
        # touches response.meta.
        page_callback = callbacks_by_leg[response.meta["leg"]]
        yield Request(url=target_url, callback=page_callback)
def parse_depute_page_leg12(self, response):
    """Build a ``DeputyItem`` from a deputy page (legislature-12 layout).

    The identifier is the last path segment of ``response.url`` up to its
    first dot; it is also used to build the photo URL. The jurisdiction is
    read from the table cell following the one whose text contains
    ``'Circonscription '`` and is reformatted as
    ``"<place> (<rank> circonscription)"``.

    Yields:
        A single ``DeputyItem`` (no ``party`` is set for this layout).

    Raises:
        IndexError: if the jurisdiction cell or the ``.titre`` element is
            missing.
        AttributeError: if the jurisdiction text does not have the
            ``"<place> (<rank>)"`` shape.
    """
    lxs = LxmlSelector(response)
    # Remove inline formatting elements (their text content is kept) before
    # querying the page.
    etree.strip_tags(lxs.xmlNode, "u", "b", "font", "i", "sup")

    # e.g. ".../some/dir/1234.asp" -> "1234"
    page_file = urlsplit(response.url)[2].split("/")[-1]
    deputy_id = page_file.split(".")[0]

    jurisdiction_cells = lxs.xpath(
        "//td[contains(text(), 'Circonscription ')]/following-sibling::td[1]/text()"
    )
    jurisdiction_line = jurisdiction_cells.extract()[0].encode("utf-8")
    jurisdiction_match = re.search(r"(.*?) \((.*)\)", jurisdiction_line)
    jurisdiction = "%s (%s circonscription)" % jurisdiction_match.groups()

    name = clean_name(lxs.css(".titre").text().extract()[0])
    image = "http://www.assemblee-nationale.fr/12/tribun/photos/%s.jpg" % deputy_id

    yield DeputyItem(
        uuid=deputy_id,
        name=name,
        image=image,
        url=response.url,
        jurisdiction=jurisdiction,
    )