def process_item(self, item): # strip leading zero district = str(int(item.get("id"))) image = CSS(".mem-pic a img").match_one(item).get("src") name = CSS(".mem-name a").match_one(item) district_addr, capitol_addr = self.process_addresses(item) # email, twitter, facebook are all sometimes present try: email = CSS(".mem-email a").match_one(item).text.strip() except SelectorError: email = "" try: twitter = CSS(".fa-twitter").match_one(item) twitter = twitter.getparent().get("href").split("/")[-1] except SelectorError: twitter = "" try: facebook = CSS(".fa-facebook").match_one(item) facebook = facebook.getparent().get("href").split("/")[-1] except SelectorError: facebook = "" party = self.party_mapping[district][1] p = ScrapePerson( state="ny", chamber="lower", image=image, party=party, district=district, name=name.text.strip(), email=email, ) p.add_link(url=name.get("href")) p.add_source(url=name.get("href")) if twitter: p.ids.twitter = twitter if facebook: p.ids.facebook = facebook p.district_office.address = district_addr["address"] p.district_office.voice = district_addr["phone"] or "" p.district_office.fax = district_addr["fax"] or "" p.capitol_office.address = capitol_addr["address"] p.capitol_office.voice = capitol_addr["phone"] or "" p.capitol_office.fax = capitol_addr["fax"] or "" return p
def process_item(self, item): try: title = XPath("..//preceding-sibling::h3/text()").match(item) except SelectorError: title = XPath("../../..//preceding-sibling::h3/text()").match(item) for comm_name in title: if (comm_name == "Standing Committees" or comm_name == "Appropriations Subcommittees"): name_link = CSS("a").match_one(item) name = name_link.text_content() source = name_link.get("href") if comm_name == "Standing Committees": com = ScrapeCommittee(name=name, chamber=self.chamber) else: com = ScrapeCommittee( name=name, classification="subcommittee", chamber=self.chamber, parent="Appropriations", ) return SenateCommitteeDetail(com, source=source) else: self.skip()
def process_item(self, item): try: link = CSS("a").match(item)[1] except SelectorError: self.skip() data = { "last_name": link.text_content(), "url": link.get("href"), } for key, label in self.LABELS.items(): data[key] = CSS(f"[id$={label}]").match_one( item).text_content().strip() party = {"(D)": "Democratic", "(R)": "Republican"}[data["party"]] address = "Hawaii State Capitol, Room " + data["room"] chamber = "upper" if data["chamber"] == "S" else "lower" p = ScrapePerson( name=data["first_name"] + " " + data["last_name"], state="hi", chamber=chamber, district=data["district"], given_name=data["first_name"], family_name=data["last_name"], party=party, email=data["email"], ) p.capitol_office.address = address p.capitol_office.voice = data["voice"] p.capitol_office.fax = data["fax"] p.add_source(data["url"]) p.add_link(data["url"]) return p
def process_item(self, item):
    tds = CSS("td").match(item)

    # skip header rows
    if len(tds) == 1 or tds[0].get("class") == "header":
        self.skip()

    first_link = CSS("td a").match(item)[0]
    name = first_link.text_content()
    detail_link = first_link.get("href")

    district = tds[3].text_content()
    party_letter = tds[4].text_content()
    party_dict = {"D": "Democratic", "R": "Republican", "I": "Independent"}
    party = party_dict[party_letter]

    p = ScrapePerson(
        name=name,
        state="il",
        party=party,
        chamber=self.chamber,
        district=district,
    )

    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")

    return LegDetail(p, source=detail_link)
def process_item(self, item): com_link = CSS("a").match(item)[0] name = com_link.text_content() com = ScrapeCommittee(name=name, classification="committee", chamber=self.chamber) detail_link = com_link.get("href") com.add_source(detail_link) com.add_link(detail_link, note="homepage") return CommitteeDetail(com, source=detail_link)
def process_page(self): p = self.input district = CSS("div.hidden-xs.mem-info h3").match_one(self.root).text_content() title, district = re.search(r"(.+)\s\|\sDistrict\s(\d+)", district).groups() p.district = district if title != "Representative": p.extras["title"] = title assistant = CSS("div.hidden-xs.mem-info a").match(self.root)[0] assistant_name = assistant.text_content() assistant_email = assistant.get("href") assistant_email = re.search(r"mailto:(.+)", assistant_email).groups()[0] assistant_phones = ( CSS("div.hidden-xs.mem-info p.no-margin").match(self.root)[1].text_content() ) phone1, phone2 = re.search(r"Phone:\s(.+)\s\|\s(.+)", assistant_phones).groups() p.extras["assistant name"] = assistant_name p.extras["assistant email"] = assistant_email p.extras["assistant phone1"] = phone1 p.extras["assistant phone2"] = phone2 press_name = ( CSS("div.hidden-xs.mem-info div.small-block.last p") .match(self.root)[0] .text_content() ) press_phone = ( CSS("div.hidden-xs.mem-info div.small-block.last p") .match(self.root)[1] .text_content() ) press_phone = re.search(r"Phone:\s(.+)", press_phone).groups()[0] press_email = ( CSS("div.hidden-xs.mem-info div.small-block.last a") .match_one(self.root) .text_content() ) p.extras["press contact name"] = press_name p.extras["press contact phone"] = press_phone p.extras["press contact email"] = press_email return p
def process_item(self, item): com_link = CSS("a").match_one(item) name = com_link.text_content() com = ScrapeCommittee( name=name, chamber=self.chamber, ) detail_link = com_link.get("href") com.add_source(self.source.url) com.add_source(detail_link) com.add_link(detail_link, note="homepage") # this link has broken html (not able to grab member info) # just returning name, chamber, and link if detail_link == "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/": return com return DetailCommitteePage(com, source=detail_link)