def process_item(self, item):
    """Build a ScrapePerson from one Michigan House roster table row.

    ``item`` is a table-row element whose children are the columns, in
    order: website, district, name, party, office, phone, email.
    Header rows (``<th>`` cells) are skipped.  Returns the populated
    ScrapePerson.
    """
    website, district, name, party, office, phone, email = item.getchildren()

    # Skip the header row — its first cell is a <th>, not a <td>.
    if website.tag == "th":
        self.skip()

    # Expand abbreviated office names to their full forms using the
    # class-level mapping.
    office = office.text_content()
    for abbr, full in self.office_names.items():
        office = office.replace(abbr, full)

    p = ScrapePerson(
        name=name.text_content(),
        state="mi",
        chamber="lower",
        district=district.text_content().lstrip("0"),
        party=party.text_content(),
        email=email.text_content(),
    )

    link = CSS("a").match_one(website).get("href")
    # Repair scheme typos like "http:/rep..." -> "http://rep...".
    # The "://" guard leaves well-formed links untouched, and count=1
    # ensures a ":/" appearing later in the URL is never rewritten.
    if link.startswith(("http:/", "https:/")) and "://" not in link:
        link = link.replace(":/", "://", 1)
    p.add_link(link)
    p.add_source(self.source.url)
    p.capitol_office.voice = phone.text_content()
    p.capitol_office.address = office
    return p
def process_item(self, item):
    """Build a ScrapePerson from one West Virginia roster table row.

    Skips vacant seats, normalizes the party label, and returns a
    LegDetail follow-up page for the member's detail link.
    """
    cells = CSS("td").match(item)
    anchors = CSS("td a").match(item)

    # The member's name lives in the second anchor of the row.
    member_name = anchors[1].text_content().strip()
    if member_name == "Vacant":
        self.skip()

    party = cells[1].text_content().strip()
    if party == "Democrat":
        party = "Democratic"

    p = ScrapePerson(
        name=member_name,
        state="wv",
        chamber=self.chamber,
        district=cells[2].text_content().strip().lstrip("0"),
        party=party,
    )
    p.add_source(self.source.url)

    p.email = cells[4].text_content().strip()

    # The capitol address is spread across bare text nodes inside the
    # fourth cell (XPath td[4] is 1-indexed); join them with spaces.
    address_lines = XPath("td[4]/text()").match(item)
    p.capitol_office.address = " ".join(
        line.strip() for line in address_lines
    ).strip()

    p.capitol_office.voice = cells[5].text_content().strip()

    # Percent-encode spaces so the detail URL is fetchable.
    detail_link = anchors[1].get("href").replace(" ", "%20")
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")
    return LegDetail(p, source=URL(detail_link, timeout=30))
def process_page(self):
    """Scrape a New Jersey legislator's personal page into a ScrapePerson."""
    party = CSS("i").match(self.root)[0].text_content().strip()

    # Some members list a leadership position inside the same <i>
    # element; extract it so it doesn't pollute the party string.
    position = ""
    try:
        position = CSS("i font").match_one(self.root).text_content().strip()
        party = party.replace(position, "")
    except SelectorError:
        pass

    # Match the literal "(D)" / "(R)" markers.  The parentheses must be
    # escaped: an unescaped "(D)" is a regex *group* that matches any
    # letter "D" anywhere in the string.
    if re.search(r"\(D\)", party):
        party = "Democrat"
    elif re.search(r"\(R\)", party):
        party = "Republican"
    else:
        self.warn(f"the party {party} must be included")

    # Positional selectors below are tied to the page layout — fragile
    # by nature, preserved as-is.
    phone_numbers = XPath("//font[@size='2']").match(self.root)[10].text_content()
    district_office = CSS("p").match(self.root)[13].getchildren()
    image = (
        XPath("//img[contains(@src, 'memberphotos')]")
        .match_one(self.root)
        .get("src")
    )
    district = CSS("font b").match(self.root)[26].text_content().split(" ")[1]

    # All emails should still be [email protected] and [email protected] -
    # many reps have these emails on their personal pages.  The name
    # parts are separated by non-breaking spaces (\xa0).
    name_parts = self.input.name.split("\xa0")
    last_name = name_parts[-1]
    if self.input.chamber == "upper":
        email = f"Sen{last_name}@njleg.org"
    elif self.input.chamber == "lower":
        email = f"Asm{last_name}@njleg.org"
    else:
        # Defensive default: chamber should always be upper/lower, but
        # without this branch `email` would be unbound and the
        # ScrapePerson call below would raise NameError.
        email = ""
        self.warn(f"unexpected chamber {self.input.chamber!r}; no email set")

    p = ScrapePerson(
        name=self.input.name,
        state="nj",
        chamber=self.input.chamber,
        party=party,
        image=image,
        district=district,
        email=email,
    )
    p.add_source(self.input.url)
    p.add_source(self.source.url)

    if position != "":
        p.extras["role"] = position.replace("(", "").replace(")", "").strip()

    try:
        # NOTE(review): findall returns a list, which is passed wholesale
        # as fax_number — presumably process_address handles a list; confirm.
        fax_match = phone_fax_pattern.findall(
            XPath("//font[@size='2']").match(self.root)[12].text_content()
        )
        for office_el in district_office:
            address = office_el.text_content()
            if fax_match:
                process_address(address, phone_numbers, p, fax_number=fax_match)
            else:
                process_address(address, phone_numbers, p)
    except SelectorError:
        pass
    return p