class SenateCommitteeList(HtmlListPage):
    """Scrape the CA Senate committees listing page."""

    source = URL("http://senate.ca.gov/committees")
    selector = XPath("//h2/../following-sibling::div//a")

    def process_item(self, item):
        # The anchor text is the committee name; a couple of anchors are
        # informational links, not committees.
        name = XPath("text()").match_one(item)
        if name in ["Teleconference How-To Information", "Legislative Process"]:
            self.skip()

        url = XPath("@href").match_one(item)

        if name.startswith("Joint"):
            com = ScrapeCommittee(
                name=name, classification="committee", chamber="legislature"
            )
        elif name.startswith("Subcommittee"):
            # The parent committee name is the first child four levels up
            # in the markup.
            container = item.getparent().getparent().getparent().getparent()
            parent_name = container.getchildren()[0].text_content()
            com = ScrapeCommittee(
                name=name,
                classification="subcommittee",
                chamber="upper",
                parent=parent_name,
            )
        else:
            com = ScrapeCommittee(
                name=name, classification="committee", chamber="upper"
            )

        com.add_source(self.source.url)
        com.add_source(url)
        com.add_link(url, note="homepage")
        return ChooseType(com, source=URL(url))
class House(HtmlListPage):
    """Scrape PR House members from the chamber composition page."""

    source = URL(
        "http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx"
    )
    selector = CSS("ul.list-article li", num_items=49)

    def process_item(self, item):
        # The bio block lists name / title / district on separate lines.
        lines = (
            CSS("div.biodiv a").match_one(item).text_content().strip().split("\n")
        )
        # Drop the honorific prefix from the name.
        name = re.sub(r"^Hon\.", "", lines[0].strip(), flags=re.IGNORECASE).strip()

        district = lines[2].strip()
        if district == "Representante por Acumulación":
            district = "At-Large"
        else:
            district = re.search(
                r"Representante\sdel\sDistrito\s(.+)", district
            ).groups()[0]

        partial = PartialRep(name=name, district=district, source=self.source.url)
        return RepDetail(partial, source=CSS("a").match_one(item).get("href"))
def process_page(self):
    """Yield a detail-page task for every DE legislator in the JSON list."""
    parties = {"D": "Democratic", "R": "Republican", "I": "Independent"}
    for member in self.data["Data"]:
        p = ScrapePerson(
            name=member["PersonFullName"],
            state="de",
            party=parties[member["PartyCode"]],
            chamber=self.chamber,
            district=member["DistrictNumber"],
        )
        p.add_source(self.source.url)
        detail = URL(
            f"https://legis.delaware.gov/LegislatorDetail?personId={member['PersonId']}"
        )
        p.add_source(detail.url)
        p.add_link(detail.url, note="homepage")
        yield LegDetail(p, source=detail.url)
def process_item(self, item):
    """Build a SC legislator from one member listing row."""
    raw_name = CSS("a.membername").match_one(item).text_content()
    name = re.search(r"(Senator|Representative)\s(.+)", raw_name).groups()[1]

    # Party abbreviation is the tail text after the name link; anything
    # other than (D)/(R) is passed through unchanged.
    party = CSS("a.membername").match_one(item).tail.strip()
    party = {"(D)": "Democratic", "(R)": "Republican"}.get(party, party)

    district_text = CSS("div.district a").match_one(item).text_content().strip()
    district = re.search(r"District\s(.+)", district_text).groups()[0]

    p = ScrapePerson(
        name=name,
        state="sc",
        chamber=self.chamber,
        district=district,
        party=party,
    )

    detail_link = CSS("div.district a").match_one(item).get("href")
    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")
    p.image = CSS("img").match_one(item).get("src")
    return LegDetail(p, source=URL(detail_link, timeout=20))
class EmailAugmentation(HtmlListPage):
    """
    WA Email addresses are listed on a separate page.
    """

    source = URL("https://app.leg.wa.gov/memberemail/Default.aspx")

    def find_rows(self):
        return CSS("#membertable tbody tr").match(self.root, num_items=147)

    def process_page(self):
        # Map cleaned member name -> (email, party, district, position).
        # Position is enough to discriminate the chamber too
        # (0 = upper, 1,2 = lower).
        mapping = {}
        for row in self.find_rows():
            cells = row.getchildren()
            raw_name = CSS("a").match_one(cells[0]).text_content().strip()
            member = re.sub(r"^(Rep\.\s|Senator\s)", "", raw_name)
            mapping[member] = (
                cells[1].text_content().strip(),  # email
                cells[4].text_content().strip(),  # party
                cells[2].text_content().strip(),  # district
                cells[3].text_content().strip(),  # position
            )
        return mapping
class PartyAugmentation(HtmlPage):
    """
    NY Assembly does not have partisan information on their site.
    In the past we scraped NYBOE, but that broke.  This is our best option
    besides hard-coding... and it isn't good.
    """

    source = URL("https://en.wikipedia.org/wiki/New_York_State_Assembly")

    def find_rows(self):
        # The member table is the first wikitable with ~150 rows.
        for table in CSS("table.wikitable").match(self.root):
            rows = CSS("tr").match(table)
            if len(rows) >= 150:
                return rows

    def process_page(self):
        # district -> (name, party)
        mapping = {}
        for row in self.find_rows():
            cells = row.getchildren()
            dist = cells[0].text_content().strip()
            name = cells[1].text_content().strip()
            party = cells[2].text_content().strip()
            # Strip Wikipedia footnote markers like "[a]".
            if "[" in party:
                party = party.split("[")[0]
            mapping[dist] = (name, party)
        return mapping
class SenList(HtmlListPage):
    """Scrape the AR Senate committees list.

    The page mixes Senate, Joint, and Task Force sections under headings;
    only Senate committees are kept here (joint ones are scraped elsewhere).
    """

    source = URL("https://senate.arkansas.gov/senators/committees/")
    selector = CSS("ins > ul > li", num_items=45)

    def process_item(self, item):
        comm_name = CSS("a").match(item)[0].text_content().strip()

        # Walk back to the nearest childless preceding sibling: its text is
        # the section heading naming the committee type.
        chamber_type = None
        for sib in item.getparent().itersiblings(preceding=True):
            if len(sib.getchildren()) == 0:
                chamber_type = sib.text_content().strip()
                break

        if chamber_type == "Senate Committees":
            chamber = "upper"
        else:
            # "Joint Committees", "Task Forces", or an unrecognized/missing
            # heading: not a Senate committee, skip it.  (Previously an
            # unknown heading crashed with an unbound `chamber` NameError.)
            self.skip()

        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )
        detail_link = CSS("a").match(item)[0].get("href")
        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return SenDetail(com, source=detail_link)
class JointSubComms(HtmlListPage):
    """Scrape AR joint subcommittees."""

    source = URL("https://www.arkleg.state.ar.us/Committees/List?type=Joint")
    selector = CSS("div#bodyContent li a", num_items=31)

    def process_item(self, item):
        sub_name = item.text_content().strip()

        # The parent committee name is the first child of the
        # great-great-grandparent container.
        container = item.getparent().getparent().getparent().getparent()
        parent = container.getchildren()[0].text_content().strip()

        # Budget hearings are not a real subcommittee.
        if parent.title() == "Alc-Jbc Budget Hearings":
            self.skip()

        com = ScrapeCommittee(
            name=sub_name.title(),
            classification="subcommittee",
            chamber="legislature",
            parent=parent.title(),
        )
        detail_link = item.get("href")
        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return HouseJointDetail(com, source=detail_link)
class CouncilList(HtmlListPage):
    """Scrape DC council members."""

    source = URL("http://dccouncil.us/councilmembers/")
    selector = CSS("li.column", num_items=14)

    def process_item(self, item):
        # Not every card has an <h3> title.
        try:
            title = CSS("h3").match_one(item).text_content()
        except SelectorError:
            title = None
        # this member is listed twice. skip the 1st time
        if title == "Chair Pro Tempore":
            self.skip()

        partial_p = PartialPerson(
            state="dc",
            chamber="legislature",
            image=CSS("img").match_one(item).get("src"),
            source1=self.source.url,
            source2=CSS("a").match(item)[1].get("href"),
            link=CSS("a").match(item)[1].get("href"),
        )
        return CouncilDetail(partial_p, source=CSS("a").match(item)[1].get("href"))
class Representatives(HtmlListPage):
    """Scrape MO House members from the member grid."""

    # note: there is a CSV, but it requires a bunch of ASP.net hoops to actually get
    source = URL(
        "https://house.mo.gov/MemberGridCluster.aspx?year=2021&code=R+&filter=clear"
    )
    selector = CSS("tr")

    def process_item(self, item):
        cells = CSS("td").match(item, min_items=0, max_items=8)
        # Header rows have no <td> cells.
        if not cells:
            self.skip()
        _, last, first, district, party, town, phone, room = cells
        if last.text_content() == "Vacant":
            self.skip()
        partial = HousePartial(
            last_name=last.text_content(),
            first_name=first.text_content(),
            district=int(district.text_content()),
            party=party.text_content(),
            hometown=town.text_content().strip(),
            voice=phone.text_content(),
            room=room.text_content(),
            url=CSS("a").match_one(last).get("href"),
        )
        return HouseDetail(partial)
class RepublicanHouse(RedRepList):
    # Caucus roster page; ?pos=0,100,100 looks like a paging querystring
    # that returns up to 100 members — TODO confirm.
    source = URL(
        "https://www.indianahouserepublicans.com/members/?pos=0,100,100",
        timeout=10)
    # Member count fluctuates between sessions, so accept a loose range.
    selector = CSS("div.member-list a", min_items=60, max_items=100)
    chamber = "lower"
    party = "Republican"
def process_item(self, item):
    """Build a CA Assembly committee from one listing link."""
    comm_name = CSS("a").match_one(item).text_content()
    comm_url = CSS("a").match_one(item).get("href")

    # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
    if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
        self.skip()

    # Joint Committees are being skipped to avoid duplicates (they were
    # already grabbed during SenateCommitteeList())
    if comm_name.startswith(("Joint Committee", "Joint Legislative")):
        self.skip()
    elif comm_name.startswith("Subcommittee"):
        # Parent committee name is the first child two levels up.
        parent_comm = (
            item.getparent().getparent().getchildren()[0].text_content()
        )
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="lower",
            parent=parent_comm,
        )
    else:
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="lower"
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
def process_item(self, item):
    """Build a PA legislator from one member listing entry."""
    # Name is listed "Last, First"; flip it around.
    parts = CSS("a").match_one(item).text_content().strip().split(", ")
    name = f"{parts[1]} {parts[0]}"

    district_text = CSS("br").match(item)[-1].tail.strip()
    district = re.search(r"District\s(.+)", district_text).groups()[0]

    # Anything other than (D)/(R)/(I) is passed through unchanged.
    party = CSS("b").match_one(item).tail.strip()
    party = {
        "(D)": "Democratic",
        "(R)": "Republican",
        "(I)": "Independent",
    }.get(party, party)

    p = ScrapePerson(
        name=name,
        state="pa",
        chamber=self.chamber,
        district=district,
        party=party,
    )
    detail_link = CSS("a").match_one(item).get("href")
    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")
    return LegDetail(p, source=URL(detail_link, timeout=10))
def process_item(self, item):
    """Follow only the Standing and Statutory committee-type links."""
    if item.text_content() not in ("Standing", "Statutory"):
        self.skip()
    return SenateTypeCommitteeList(source=URL(item.get("href"), timeout=30))
def process_item(self, item):
    """Build a CA Senate committee from one listing link."""
    comm_name = XPath("text()").match_one(item)

    # Informational links on the page, not committees.
    if comm_name in ["Teleconference How-To Information", "Legislative Process"]:
        self.skip()

    comm_url = XPath("@href").match_one(item)

    if comm_name.startswith("Joint"):
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="legislature"
        )
    elif comm_name.startswith("Subcommittee"):
        # Parent committee name is the first child four levels up.
        ancestor = item.getparent().getparent().getparent().getparent()
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="upper",
            parent=ancestor.getchildren()[0].text_content(),
        )
    else:
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="upper"
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
class CommitteeList(JsonListPage):
    """Scrape the GA committee list from the legislature's JSON API."""

    source = URL(
        "https://www.legis.ga.gov/api/committees/List/1029",
        headers={"Authorization": get_token()},
    )

    # API chamber codes observed in the feed: 1 = House, 2 = Senate.
    chambers = {2: "upper", 1: "lower"}

    def process_item(self, item):
        try:
            chamber = self.chambers[item["chamber"]]
        except KeyError:
            # Previously an unknown code left `chamber` unbound and crashed
            # with a NameError; raise a meaningful error instead.
            raise ValueError(f"unexpected chamber code: {item['chamber']}")

        source = URL(
            f"https://www.legis.ga.gov/api/committees/details/{item['id']}/1029",
            headers={"Authorization": get_token()},
        )
        com = ScrapeCommittee(
            name=item["name"],
            chamber=chamber,
        )
        com.add_source(
            self.source.url,
            note="Initial list page (requires authorization token)",
        )
        return CommitteeDetail(
            com,
            source=source,
        )
def process_item(self, item):
    """Build a MO House committee from one listing link."""
    committee_name = item.text_content()

    # only scrape joint coms on senate scrape
    if any(
        marker in committee_name
        for marker in ("Joint", "Task Force", "Conference")
    ):
        self.skip()

    committee_name = remove_comm(committee_name).strip()

    if "Subcommittee" in committee_name:
        name = committee_name.replace("Subcommittee on ", "").replace(
            ", Subcommittee", ""
        )
        parent_el = XPath("..//..//preceding-sibling::a").match(item)[0]
        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
            classification="subcommittee",
            parent=remove_comm(parent_el.text_content()),
        )
    else:
        com = ScrapeCommittee(name=committee_name, chamber=self.chamber)

    # We can construct a URL that would make scraping easier, as opposed to the link that is directly given
    comm_link = item.get("href").replace("https://www.house.mo.gov/", "")
    source = f"https://www.house.mo.gov/MemberGridCluster.aspx?filter=compage&category=committee&{comm_link}"
    return HouseCommitteeDetail(com, source=URL(source, timeout=30))
class DemocraticSenate(BlueSenList):
    # Caucus roster page for Indiana Senate Democrats.
    source = URL("https://www.indianasenatedemocrats.org/senators/")
    # One <li> per senator; exact count pinned so roster changes fail loudly.
    selector = CSS(
        "article ul li",
        num_items=11,
    )
    chamber = "upper"
    party = "Democratic"
def test_html_page():
    """HtmlPage.postprocess_response parses the body and absolutizes links."""
    page = HtmlPage(source=URL(SOURCE))
    page.response = Response(b"<html><a href='/test'>link</a></html>")
    page.postprocess_response()
    # postprocessing must have populated page.root
    anchor = page.root.xpath("//a")[0]
    # relative hrefs are normalized against example.com
    assert anchor.get("href") == "https://example.com/test"
def process_item(self, item):
    """Build a GA legislator from one API member record and chain to detail."""
    # chamberType discriminates House/Senate via the class's lookup tables
    # (chamber_types / chamber_names, defined on the class elsewhere).
    chamber_id = item["district"]["chamberType"]
    p = ScrapePerson(
        state="ga",
        chamber=self.chamber_types[chamber_id],
        district=str(item["district"]["number"]),
        name=item["fullName"],
        family_name=item["name"]["familyName"],
        given_name=item["name"]["first"],
        suffix=item["name"]["suffix"] or "",
        party=self.party_ids[item["party"]],
    )
    # district address
    da = item["districtAddress"]
    if da["email"]:
        p.email = da["email"]
    if da["phone"]:
        p.district_office.voice = da["phone"]
    if da["fax"]:
        p.district_office.fax = da["fax"]
    if da["address1"]:
        # Build "addr1; addr2; city, state zip" — city/state/zip are only
        # appended when address1 exists.
        p.district_office.address = da["address1"]
        if da["address2"]:
            p.district_office.address += "; " + da["address2"]
        p.district_office.address += "; {city}, {state} {zip}".format(**da)
        p.district_office.address = p.district_office.address.strip()
    # photos
    if not item["photos"]:
        pass
    elif len(item["photos"]) == 1:
        p.image = item["photos"][0]["url"].split("?")[
            0]  # strip off ?size=mpSm for full size
    else:
        # More than one photo is unexpected; fail loudly so the scraper
        # is updated rather than silently picking one.
        raise Exception("unknown photos configuration: " + str(item["photos"]))
    # extras
    p.extras["residence"] = item["residence"]
    p.extras["city"] = item["city"].strip()
    p.extras["georgia_id"] = item["id"]
    # Human-facing profile URL recorded as the source; the API detail URL
    # below (token-authenticated) is what actually gets scraped next.
    url = (
        f"https://www.legis.ga.gov/members/{self.chamber_names[chamber_id]}/"
        f"{item['id']}?session={item['sessionId']}")
    p.add_source(url, note="Initial list page (requires authorization token)")
    source = URL(
        f"https://www.legis.ga.gov/api/members/detail/{item['id']}?session=1029&chamber={chamber_id}",
        headers={"Authorization": get_token()},
    )
    return LegDetail(p, source=source)
def process_page(self):
    """Yield a member-detail task for every KS House and Senate member."""
    members = (
        self.data["content"]["house_members"]
        + self.data["content"]["senate_members"]
    )
    for member in members:
        # source is a URL object, we need the .url member
        detail_url = self.source.url + member["KPID"] + "/"
        yield MembersDetail(source=URL(detail_url, timeout=10))
def process_item(self, item):
    """Build a committee from its listing link and chain to its detail page."""
    link = CSS("a").match(item)[0]
    com = ScrapeCommittee(
        name=link.text_content(),
        classification="committee",
        chamber=self.chamber,
    )
    homepage = link.get("href")
    com.add_source(homepage)
    com.add_link(homepage, "homepage")
    return HouseCommitteeDetail(com, source=URL(homepage, timeout=30))
def process_item(self, item):
    """Chain to a member detail page, skipping members who have left office."""
    name = item.text
    # Drop rows annotated as no longer serving.
    if any(word in name.lower() for word in ("resigned", "vacated", "retired")):
        return
    name, action, date = clean_name(name)
    url = item.get("href")
    return self.next_page_cls(
        PartialMember(name=name, url=url),
        source=URL(url, timeout=10),
    )
class CommitteeList(HtmlListPage):
    """Scrape the AK committee list (House, Senate, and Joint sections)."""

    source = URL("http://www.akleg.gov/basis/Committee/List/32")
    selector = CSS("div.area-frame ul.list li", num_items=112)

    def process_item(self, item):
        comm_name = (
            item.text_content().strip().split(" (")[0].title().replace(
                "(Fin Sub)", "")
        )
        if "Conference" in comm_name:
            self.skip()

        # Section heading two siblings back names the chamber; an
        # unrecognized heading passes through as-is.
        heading = (
            item.getparent().getprevious().getprevious().text_content().strip()
        )
        chamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint Committee": "legislature",
        }.get(heading, heading)

        classification = item.getparent().getprevious().text_content().strip()
        kwargs = {"name": comm_name, "chamber": chamber}
        if classification == "Finance Subcommittee":
            # work around duplicate name of Judiciary committees
            # a current limitation in how Open States can handle committees
            # see https://github.com/openstates/issues/issues/598
            if comm_name == "Judiciary":
                kwargs["name"] = "Judiciary (Finance)"
            kwargs.update(classification="subcommittee", parent="Finance")
        else:
            kwargs["classification"] = "committee"
        com = ScrapeCommittee(**kwargs)

        detail_link = CSS("a").match_one(item).get("href")
        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return CommiteeDetail(com, source=URL(detail_link, timeout=30))
class CommitteeList(HtmlListPage):
    """Scrape DC council committees for council period 23."""

    source = URL("https://dccouncil.us/committees-for-council-period-23/")
    selector = CSS("div ul li div")
    chamber = "legislature"

    def process_item(self, item):
        anchor = CSS("a").match(item)[0]
        homepage = anchor.get("href")
        com = ScrapeCommittee(
            name=anchor.text_content(),
            classification="committee",
            chamber=self.chamber,
        )
        com.add_source(homepage)
        com.add_link(homepage, note="homepage")
        return CommitteeDetail(com, source=homepage)
class Legislators(JsonListPage):
    """Scrape SD legislators from the session-members API."""

    source = URL("https://sdlegislature.gov/api/SessionMembers/Session/44")

    def process_item(self, item):
        first = item["FirstName"]
        last = item["LastName"]
        initial = item["Initial"]
        # V. J. puts his initials as his first name
        if initial and first != "V. J.":
            name = f"{first} {initial}. {last}"
        else:
            name = f"{first} {last}"

        p = ScrapePerson(
            name=name,
            family_name=last,
            given_name=first,
            state="sd",
            district=item["District"].lstrip("0"),
            chamber="upper" if item["MemberType"] == "S" else "lower",
            party=item["Politics"],
            email=item["EmailState"] or "",
            image=(
                "https://lawmakerdocuments.blob.core.usgovcloudapi.net/photos/"
                + item["Picture"].lower()
            ),
        )

        # Home address: "addr1[; addr2]; city, state zip"
        address_parts = [item["HomeAddress1"]]
        if item["HomeAddress2"]:
            address_parts.append(item["HomeAddress2"])
        address_parts.append(
            f"{item['HomeCity']}, {item['HomeState']} {item['HomeZip']}"
        )
        p.district_office.address = "; ".join(address_parts)
        p.district_office.voice = item["HomePhone"] or ""
        p.capitol_office.voice = item["CapitolPhone"] or ""
        p.extras["occupation"] = item["Occupation"]

        url = f"https://sdlegislature.gov/Legislators/Profile/{item['SessionMemberId']}/Detail"
        p.add_link(url)
        p.add_source(url)
        return p
class Senate(HtmlListPage):
    """Scrape PR senators."""

    source = URL("https://senado.pr.gov/Pages/Senadores.aspx")
    selector = CSS("ul.senadores-list li", num_items=27)

    def process_item(self, item):
        # Convert names to title case as they are in all-caps
        raw = CSS("span.name").match_one(item).text_content().strip()
        name = re.sub(r"^Hon\.", "", raw, flags=re.IGNORECASE).strip().title()

        party = CSS("span.partido").match_one(item).text_content().strip()
        # Translate to English since being an Independent is a universal construct
        if party == "Independiente":
            party = "Independent"

        partial = PartialSen(name=name, party=party, source=self.source.url)
        return SenDetail(partial, source=CSS("a").match_one(item).get("href"))
class Legislators(XmlListPage):
    # BASIS session number; interpolated into the source URL below and
    # into per-member detail links.
    session_num = "32"
    source = URL(
        "http://www.legis.state.ak.us/publicservice/basis/members"
        f"?minifyresult=false&session={session_num}",
        headers={"X-Alaska-Legislature-Basis-Version": "1.4"},
    )
    selector = XPath("//Member/MemberDetails")

    def process_item(self, item):
        """Build an AK legislator from one <MemberDetails> XML element."""
        # Pull each tag named in ELEMENTS (module-level list, defined
        # elsewhere); _get_if_exists presumably returns None for missing
        # tags — TODO confirm.
        item_dict = {elem: _get_if_exists(item, elem) for elem in ELEMENTS}
        chamber = item.attrib["chamber"]
        code = item.attrib["code"].lower()
        party = item_dict["Party"]
        if party == "N":
            # "N" presumably means nonpartisan/no party — mapped to Independent.
            party = "Independent"
        person = ScrapePerson(
            name="{FirstName} {LastName}".format(**item_dict),
            given_name=item_dict["FirstName"],
            family_name=item_dict["LastName"],
            state="ak",
            party=party,
            chamber=("upper" if chamber == "S" else "lower"),
            district=item_dict["District"],
            image=f"http://akleg.gov/images/legislators/{code}.jpg",
            email=item_dict["EMail"],
        )
        person.add_link(
            "http://www.akleg.gov/basis/Member/Detail/{}?code={}".format(
                self.session_num, code))
        person.add_source("http://w3.akleg.gov/")
        if item_dict["Phone"]:
            # Phone arrives as 7 digits; prepend Alaska's statewide 907
            # area code and hyphenate.
            phone = "907-" + item_dict["Phone"][0:3] + "-" + item_dict[
                "Phone"][3:]
            person.capitol_office.voice = phone
        if item_dict["Building"] == "CAPITOL":
            person.capitol_office.address = (
                "State Capitol Room {}; Juneau, AK, 99801".format(
                    item_dict["Room"]))
        return person
class SenateCommittee(HtmlListPage):
    """Scrape NH Senate committees."""

    source = URL(
        "http://www.gencourt.state.nh.us/Senate/committees/senate_committees.aspx",
        timeout=30,
    )
    chamber = "upper"
    selector = CSS("#form1 div h5")

    def process_item(self, item):
        anchor = CSS("a").match(item)[0]
        homepage = anchor.get("href")
        com = ScrapeCommittee(
            name=anchor.text_content(),
            classification="committee",
            chamber=self.chamber,
        )
        com.add_source(homepage)
        com.add_link(homepage, "homepage")
        return SenateCommitteeDetail(com, source=URL(homepage, timeout=30))
class LegList(HtmlListPage):
    """Scrape ND legislators from the members-by-district listing."""

    source = URL(
        "https://www.legis.nd.gov/assembly/67-2021/members/members-by-district"
    )
    selector = CSS("div.view-content > div", num_items=142)

    def process_item(self, item):
        raw_name = CSS("div.name").match_one(item).text_content().strip()
        name = re.search(r"(Senator|Representative)\s(.+)", raw_name).groups()[1]
        # Luke Simons was expelled on 3/4/21
        if name == "Luke Simons":
            self.skip()

        chamber = CSS("div.chamber").match_one(item).text_content().strip()
        chamber = {"Senate": "upper", "House": "lower"}.get(chamber, chamber)

        # District number comes from the nearest preceding "title" heading.
        for heading in item.itersiblings(preceding=True):
            if heading.get("class") == "title":
                district = re.search(
                    r"District\s(.+)", heading.text_content().strip()
                ).groups()[0]
                break

        party = CSS("div.party").match_one(item).text_content().strip()
        if party == "Democrat":
            party = "Democratic"

        p = ScrapePerson(
            name=name,
            state="nd",
            chamber=chamber,
            district=district,
            party=party,
        )
        detail_link = CSS("div.name a").match_one(item).get("href")
        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")
        return LegDetail(p, source=detail_link)