def process_page(self):
    com = self.input

    # no members
    if (
        CSS("div.Membership fieldset").match_one(self.root).text_content().strip()
        == ""
    ):
        raise SkipItem("empty committee")

    members = CSS("fieldset div.area-holder ul.list li span.col01").match(self.root)

    num_members = 0
    for member in members:
        role = member.getnext().text_content().strip()

        # skip Public Members
        if role == "Public Member":
            continue
        if role == "Member":
            role = "member"

        num_members += 1
        mem_name = CSS("span span").match_one(member).text_content().strip()
        mem_name = re.search(r"(Representative|Senator)\s(.+)", mem_name).groups()[1]

        com.add_member(mem_name, role)

    if not num_members:
        raise SkipItem("only public members")

    return com
def process_page(self): com = self.input com.add_source(self.source.url) com.add_link(self.source.url, note="homepage") members = CSS(".gallery .desc").match(self.root) if not members: raise SkipItem("empty committee") positions = ["Chairman", "Vice-Chairman"] for member in members: member_position = member.text_content().replace("Senator", "").split(", ") if (member_position[0] == "House Vacancy" or member_position[0] == "Senate Vacancy"): continue member_pos_str = "member" member_name = (member_position[0].replace("Representative ", "").replace("Rep. ", "")) for pos in positions: if pos in member_position: member_pos_str = pos com.add_member(member_name, member_pos_str) if not com.members: raise SkipItem("empty") return com
def process_page(self): com = self.input members = list(CSS("table.Grid a").match(self.root)) if not members: raise SkipItem("empty committee") for member in members: name = member.text_content().strip() if re.search(r"\(", name): name_split = re.search(r"(.+),\s(.+)\s\((.+)\)", name).groups() first_name = name_split[1] last_name = name_split[0] role = name_split[2] else: name_split = re.search(r"(.+),\s(.+)", name).groups() first_name = name_split[1] last_name = name_split[0] role = "member" first_name = re.sub(""", '"', first_name) name = f"{first_name} {last_name}" com.add_member(name, role) extra_info = CSS("div table#ContentPlaceHolder1_gvClerk tr").match( self.root) for info in extra_info: if ":" in info.text_content().strip(): idx, val = info.text_content().strip().split(":") com.extras[idx.strip()] = val.strip() else: com.extras["Room"] = info.text_content().strip() return com
def process_page(self):
    com = self.input

    try:
        members = XPath("//*[@id='committeesIntroRoster']/div/div/div/a").match(
            self.root
        )
        for member in members:
            member_dirty = member.text_content().strip().split("\n")
            mem_name = member_dirty[0].strip() + " " + member_dirty[1].strip()

            # many 'ex officio' roles for House Subcommittees,
            # Joint Committees, and Joint Subcommittees
            role = (
                member.getparent().getprevious().getprevious().text_content().strip()
            )
            if not role:
                role = "member"

            com.add_member(mem_name, role)
    except SelectorError:
        raise SkipItem("empty committee")

    try:
        extra_info = CSS("div#bodyContent b").match(self.root)
        for title in extra_info:
            position = title.text_content().strip()
            name = title.getnext().tail.strip()
            com.extras[position] = name
    except SelectorError:
        pass

    return com
def process_item(self, item): name_title = XPath(".//td/span/a/text()").match(item) name_dirty = name_title[0].split(", ") print(name_dirty) if name_dirty[0] == "Vacant": raise SkipItem("vacant") name = name_dirty[1] + " " + name_dirty[0] party = CSS("td a").match(item)[2].text_content().strip() district = CSS("td a").match(item)[3].text_content().strip() district = re.search(r"No\.\s(.+)", district).groups()[0] p = ScrapePerson( name=name, state="nv", chamber=self.chamber, district=district, party=party, ) if len(name_title) > 1: title = name_title[1] p.extras["title"] = title detail_link = CSS("td span a").match_one(item).get("href") p.add_source(self.source.url) p.add_source(detail_link) p.add_link(detail_link, note="homepage") img = CSS("img").match_one(item).get("src") p.image = img extra_info = item.getnext() district_addr = CSS("td").match(extra_info)[0].text_content().strip() p.district_office.address = district_addr extra_info_detail = CSS("td div div div span.field").match(extra_info) email = extra_info_detail[1].text_content().strip() p.email = email leg_bldg_room = extra_info_detail[2].text_content().strip() if self.chamber == "upper": chamb = "Senate" else: chamb = "Assembly" cap_addr = f"Room {leg_bldg_room};c/o Nevada {chamb};401 South Carson Street;Carson City, NV 89701-4747" p.capitol_office.address = cap_addr leg_bldg_phone = extra_info_detail[3].text_content().strip() p.capitol_office.voice = leg_bldg_phone return p
def process_item(self, item): last_name = re.split("Pictures/|_", item.get("src"))[1] if last_name == "VACANT.jpeg": raise SkipItem("vacant") oid_person = item.get("alt") oid_sponsor = item.get("longdesc").split("House/")[1] url = f"http://www.legislature.state.al.us/aliswww/ISD/ALRepresentative.aspx?NAME={last_name}&OID_SPONSOR={oid_sponsor}&OID_PERSON={oid_person}&SESSNAME=" p = PartialMember(url=self.source.url, chamber=self.chamber) return LegDetail(p, source=url)
def process_page(self): com = self.input com.add_source(self.source.url) com.add_link(self.source.url, note="homepage") try: chairs = CSS(".chair-info").match(self.root) except SelectorError: raise SkipItem("skipping committee without full information") # in case there are co-chairs num_chairs = len(chairs) for chair in chairs: chair_name = CSS(".comm-chair-name").match_one(chair).text_content().strip() chair_role = ( XPath(f"..//preceding-sibling::header[{num_chairs}]") .match_one(chair) .text_content() .strip() .lower() ) com.add_member(chair_name, chair_role) # some committees only have chairs and no members list try: for p in CSS("#comm-membership ul li").match(self.root): name = p.text_content().strip() role = "member" com.add_member(name, role) except SelectorError: pass # some committees have temporary addresses, others have permanent ones try: temp, room, zip = XPath( "//section[@id='comm-addr']/div[@class='mod-inner']//text()" ).match(self.root) com.extras["address"] = f"{temp}: {room}; {zip}" except ValueError: room, zip = XPath( "//section[@id='comm-addr']/div[@class='mod-inner']//text()" ).match(self.root) com.extras["address"] = f"{room}; {zip}" # some committees have press releases try: news_link = CSS("#page-content .read-more").match(self.root)[0].get("href") com.add_link(news_link) except SelectorError: pass return com
def process_item(self, item): dd_text = XPath(".//dd/text()").match(item) district = dd_text[2].strip().split()[1] party = dd_text[4].strip() url = str(XPath(".//dd/a[1]/@href").match_one(item)) if "Details" not in url: raise SkipItem(f"skipping {url}") return PersonDetail( dict( chamber="upper" if "senate" in self.source.url else "lower", district=district, party=party, ), source=url, )
def process_page(self): com = self.input try: chair = (XPath("//h5[text()='Chair']").match_one( self.root).getnext().text_content().strip()) chair = re.search(r"(Senator|Representative)\s(.+)", chair).groups()[1] com.add_member(chair, "Chair") except SelectorError: pass try: vice_chair = (XPath("//h5[text()='Vice-Chair']").match_one( self.root).getnext().text_content().strip()) vice_chair = re.search(r"(Senator|Representative)\s(.+)", vice_chair).groups()[1] com.add_member(vice_chair, "Vice-Chair") except SelectorError: pass try: additional_members = ( XPath("//h5[text()='Additional Members']").match_one( self.root).getnext().getchildren()) for member in additional_members: member = member.text_content().strip() member = re.search(r"(Senator|Representative)\s(.+)", member).groups()[1] com.add_member(member, "member") except SelectorError: pass try: extra_info = CSS("section.content strong").match(self.root) for title in extra_info: position = title.text_content().strip() name = title.tail.strip().lstrip(":").strip() com.extras[position] = name except SelectorError: pass if not com.members: raise SkipItem("empty committee") return com
def process_item(self, item):
    comm_name = item.text_content().strip()

    com = ScrapeCommittee(
        name=comm_name,
        classification="committee",
        chamber=self.chamber,
    )

    detail_link = item.get("href")
    com.add_source(self.source.url)

    # detail links for Joint Committees are hidden behind a postback:
    # "javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvJICommittees','cmdCommittee$0')"
    if self.chamber == "legislature":
        raise SkipItem("joint committee")

    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")

    return CommDetail(com, source=detail_link)
def process_page(self): com = self.input try: members = CSS("div#CommitteeMembers tbody tr").match(self.root) except SelectorError: self.logger.warning(f"skipping members for {self.source.url}") raise SkipItem("empty") for member in members: dirty_name = CSS("a").match_one(member).text_content().strip().split(", ") last_name = dirty_name[0] first_name = dirty_name[1] name = first_name + " " + last_name try: role = XPath("//td/text()[3]").match(member)[0].strip() except SelectorError: role = "member" com.add_member(name, role) return com
def process_page(self):
    com = self.input

    members = CSS(
        "div .wpb_column.vc_column_container.col-xs-mobile-fullwidth.col-sm-6"
        " div .row-equal-height.hcode-inner-row"
    ).match(self.root)
    if not members:
        raise SkipItem("empty committee")

    for member in members:
        name = CSS("strong").match_one(member).text_content().strip()
        name = re.search(r"(Sen\.|Rep\.)\s(.+)", name).groups()[1]

        # default to None so members with no role suffix fall through to "member"
        role = None
        if re.search(r"Ad\sHoc", name):
            name, _, role = re.search(r"(.+)(\s–\s|\()(Ad\sHoc)\)?", name).groups()
        if re.search(r",\s", name):
            name, role = re.search(r"(.+),\s(.+)", name).groups()

        com.add_member(name, role if role else "member")

    return com
def process_page(self): district = ( XPath("//*[@id='main-info']/p/span[contains(text(), 'District')]") .match_one(self.root) .getnext() ) district = XPath("text()").match(district)[0].strip() # https://legislature.maine.gov/house/house/MemberProfiles/Details/1193 has no district if district != "": district = re.search(r"(\d+)\s+-", district).groups()[0] else: raise SkipItem("non-voting member") p = ScrapePerson( name=self.input.name, state="me", chamber="lower", district=district, party=self.input.party, ) p.add_source(self.input.source) p.add_source(self.source.url) p.add_link(self.source.url, note="homepage") img = CSS("img.drop-shadow").match_one(self.root).get("src") p.image = img email = CSS("div#main-info p a").match(self.root)[0].text_content().strip() p.email = email distr_addr = CSS("div#main-info p br").match(self.root)[0].tail.strip() p.district_office.address = distr_addr try: work_phone = ( XPath("//*[@id='main-info']/p/span[contains(text(), 'Work')]") .match_one(self.root) .getnext() ) work_phone = work_phone.text_content().strip() p.district_office.voice = work_phone except SelectorError: pass try: cell_phone = ( XPath("//*[@id='main-info']/p/span[contains(text(), 'Cell')]") .match_one(self.root) .getnext() ) cell_phone = cell_phone.text_content().strip() p.extras["Cell Phone"] = cell_phone except SelectorError: pass seat_no = ( XPath("//*[@id='main-info']/p/span[contains(text(), 'Seat')]") .match_one(self.root) .getnext() ) seat_no = seat_no.text_content().strip() p.extras["Seat Number"] = seat_no towns = ( XPath("//*[@id='main-info']/p/span[contains(text(), 'Town(s)')]") .match_one(self.root) .getnext() ) towns = XPath("text()").match(towns) if len(towns) > 0 and towns[0].strip() != "": p.extras["towns represented"] = [] for town in towns: p.extras["towns represented"] += [town.strip()] cap_addr = XPath( "//*[@id='welcome']/div/div[1]/div/div/table/tbody/tr[3]/td[2]/text()" ).match(self.root) capitol_address = "" for line in cap_addr: capitol_address += line.strip() capitol_address += " " p.capitol_office.address = capitol_address.strip() try: occupation = ( XPath( "//*[@id='welcome']/div/div[1]/div/div/table/tbody/tr/td[contains(text(), 'Occupation')]" ) .match_one(self.root) .getnext() ) p.extras["occupation"] = occupation.text_content().strip() except SelectorError: pass try: education = ( XPath( "//*[@id='welcome']/div/div[1]/div/div/table/tbody/tr/td[contains(text(), 'Education')]" ) .match_one(self.root) .getnext() ) if education is not None: p.extras["education"] = education.text_content().strip() except SelectorError: pass return p
def process_item(self, item): # skip header and empty rows if item[0] == "Dist" or item[0] is None: self.skip() first_name = item[3].strip() last_name = item[4].strip() name = first_name + " " + last_name district = item[0] party = item[2].strip() if not district: # non voting members ignored for now raise SkipItem(f"not voting member {name}") p = ScrapePerson( name=name, state="me", chamber="upper", district=district, party=party, ) p.given_name = first_name p.family_name = last_name detail_link = URL(f"https://legislature.maine.gov/District-{district}") p.add_source(self.source.url) p.add_source(detail_link.url) p.add_link(detail_link.url, note="homepage") county = item[1].strip() p.extras["county represented"] = county mailing_address = item[5].strip() city = item[6].strip() zip = item[8].strip() address = mailing_address + ", " + city + ", ME " + zip if re.search(r"St(\s|,)", address): address = re.sub(r"St\s", "Street ", address) address = re.sub(r"St,", "Street,", address) if re.search(r"PO\s", address): address = re.sub(r"PO\s", "P.O. ", address) if re.search(r"Saint\s", address): address = re.sub(r"Saint\s", "St. ", address) if re.search(r"N\.", address): address = re.sub(r"N\.", "North", address) p.district_office.address = address phone = item[9].strip() phone = "(207) " + phone p.district_office.voice = phone alternate = item[10] if alternate is not None: p.extras["alternate phone"] = alternate.strip() email = item[11].strip() p.email = email return SenDetail(p, source=detail_link)
def process_page(self):
    if self.input % 2:
        raise SkipItem(f"{self.input} is odd!")
    return self.input
def process_item(self, item):
    if item % 2:
        raise SkipItem(f"{item} is odd!")
    return item
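# A self-contained sketch of how the two toy callbacks above slot into
# spatula (the class, URL, and selector below are made up for illustration).
# process_item belongs on a list page and runs once per matched item;
# process_page belongs on a detail page and runs once per fetched page.
# Raising SkipItem in either place drops that one result and logs the
# reason; the scrape as a whole continues.
from spatula import CSS, HtmlListPage, SkipItem


class ExampleRoster(HtmlListPage):
    source = "https://example.com/roster"  # hypothetical URL
    selector = CSS("table tr")  # one item per table row

    def process_item(self, item):
        name = item.text_content().strip()
        if not name:
            raise SkipItem("blank row")  # skipped, not fatal
        return {"name": name}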