def handle_page(self):
    """Yield a committee detail-page scrape for every row of the index list.

    Rows whose CSS class contains ``parentcommittee`` start a new top-level
    (lower chamber) committee; the rows that follow it, until the next parent
    row, are attached to it as subcommittees via ``parent_id``.
    """
    # don't use handle_page_item because we need to look back at prior element
    parent = None
    for item in self.doc.xpath(self.list_xpath):
        cssclass = item.attrib.get("class", "")
        name = item.text_content().strip()
        if "parentcommittee" in cssclass:
            # Top-level committee: no parent, belongs to the lower chamber.
            parent = None
            chamber = "lower"
        # NOTE(review): if the very first row is not a parentcommittee row,
        # ``chamber`` is referenced before assignment — presumably the list
        # always opens with a parent row; confirm against the live page.
        comm = Organization(name=name,
                            classification="committee",
                            chamber=chamber,
                            parent_id=parent)
        yield self.scrape_page(HouseComDetail, item.attrib["href"], obj=comm)
        # parent for next time: subsequent rows hang off this committee and
        # carry no chamber of their own
        if "parentcommittee" in cssclass:
            parent = comm._id
            chamber = None
def scrape_lower_committee(self, link, name):
    """Build and return a lower-chamber committee from its detail page.

    ``link`` is the anchor element pointing at the committee page and
    ``name`` is the committee's display name.
    """
    # Some hrefs contain stray whitespace; squeeze it out before fetching.
    url = re.sub(r"\s+", "", link.attrib["href"])
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    comm = Organization(name=name, chamber="lower", classification="committee")
    comm.add_source(url)

    # Every member link carries a "?member=" query fragment; the element
    # immediately following the link holds the member's role, if any.
    for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
        member_name = member_link.text_content().strip()
        member_name = re.sub(r"^Delegate\s+", "", member_name)
        role = member_link.getnext().text or "member"
        comm.add_member(member_name, role.strip())

    return comm
def scrape_committee(self, term, href, name):
    """Scrape one committee page and yield it as an Organization.

    The chamber is inferred from the committee URL; interim/other
    committees that can't be classified are skipped entirely.
    """
    page = self.get(href).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(href)
    members = page.xpath("//div[@class='view-content']"
                         "//a[contains(@href, 'members')]")

    if "/joint/" in href:
        chamber = "legislature"
    elif "/senate/" in href:
        chamber = "upper"
    elif "/house/" in href:
        chamber = "lower"
    else:
        # interim committees and others were causing duplicate committee
        # issues, skipping
        self.warning(
            "Failed to identify chamber for {}; skipping".format(href))
        return

    cttie = Organization(name, chamber=chamber, classification="committee")
    # Section headings on the page map to membership roles.
    role_map = {
        "Legislative Members": "member",
        "Chairman": "chair",
        "Vice Chairman": "member",
    }
    for a in members:
        member = a.text
        role = a.xpath(
            "ancestor::div/h2[@class='pane-title']/text()")[0].strip()
        # Fix: an unrecognized heading previously raised KeyError and
        # aborted the whole committee; treat unknowns as plain members.
        role = role_map.get(role, "member")
        if member is None or member.startswith("District"):
            continue
        member = member.replace("Senator ", "").replace("Representative ", "")
        cttie.add_member(member, role=role)
    cttie.add_source(href)
    yield cttie
def scrape_committees(self, session):
    """Yield committee Organizations for ``session`` from the OLIS API.

    Members whose legislator code can't be resolved fall back to the raw
    code as the member name (with a warning) rather than being dropped.
    """
    session_key = SESSION_KEYS[session]
    committees_response = self.api_client.get("committees", session=session_key)
    legislators = index_legislators(self, session_key)
    for committee in committees_response:
        org = Organization(
            chamber={"S": "upper", "H": "lower", "J": "legislature"}[
                committee["HouseOfAction"]
            ],
            name=committee["CommitteeName"],
            classification="committee",
        )
        org.add_source(
            "https://olis.leg.state.or.us/liz/{session}"
            "/Committees/{committee}/Overview".format(
                session=session_key, committee=committee["CommitteeName"]
            )
        )
        members_response = self.api_client.get(
            "committee_members",
            session=session_key,
            committee=committee["CommitteeCode"],
        )
        for member in members_response:
            try:
                member_name = legislators[member["LegislatorCode"]]
            except KeyError:
                # Fix: Logger.warn is deprecated; use warning().
                logger.warning(
                    "Legislator {} not found in session {}".format(
                        member["LegislatorCode"], session_key
                    )
                )
                member_name = member["LegislatorCode"]
            org.add_member(
                member_name, role=member["Title"] if member["Title"] else ""
            )
        yield org
def _scrape_lower_standing_committee(self, committee_name, url):
    """Yield one lower-chamber standing committee with its membership."""
    doc = self.lxmlize(url)

    org = Organization(
        committee_name, chamber="lower", classification="committee"
    )
    org.add_source(url)

    # Each membership row is a styled <tr>: the member's name link sits in
    # td[1] and their role in td[2].
    member_rows = doc.xpath(
        '//table[@id="body_ListView1_itemPlaceholderContainer"]'
        '/tr[@class="linkStyle2"]'
    )
    for member_row in member_rows:
        raw_name = member_row.xpath("normalize-space(string(./td[1]/a))")
        raw_role = member_row.xpath("normalize-space(string(./td[2]))")
        org.add_member(
            self._normalize_member_name(raw_name),
            self._normalize_member_role(raw_role),
        )

    yield org
def scrape_committee(self, chamber, link, parent_comm=None):
    """Scrape one committee and, recursively, its subcommittees.

    ``link`` is the anchor to the committee's home page; ``parent_comm``
    is the parent committee's cleaned name when scraping a subcommittee.
    """
    home_link = link.attrib["href"]
    # Strip the trailing "(H)"/"(S)" chamber marker and normalize casing.
    name = re.sub(r"\s+\((H|S)\)$", "", link.text).strip().title()
    name = name.replace(".", "").strip()
    if "Subcommittee " in name and parent_comm:
        # Subcommittee: keep only the text after "Subcommittee" and attach
        # it to the previously recorded parent organization id.
        name = name.split("Subcommittee")[1]
        name = name.replace(" on ", "").replace(" On ", "")
        name = name.strip()
        comm = Organization(name,
                            parent_id=self.parents[parent_comm],
                            classification="committee")
    else:
        # Top-level committee: trim generic suffixes from the name.
        # NOTE(review): "Subcommittee" itself ends with "Committee", so the
        # first pass strips it to "Sub", which the third pass then removes —
        # the list order appears deliberate; confirm before changing.
        for c in ["Committee", "Comm", "Sub", "Subcommittee"]:
            if name.endswith(c):
                name = name[:-1 * len(c)].strip()
        comm = Organization(name, chamber=chamber, classification="committee")
        # Remember this committee's id so its subcommittees can link to it.
        self.parents[name] = comm._id
    comm.add_source(home_link)
    comm_url = home_link.replace("home.htm", "members.htm")
    self.scrape_members(comm, comm_url)
    if comm._related:
        yield comm
    else:
        self.logger.warning("Empty committee, skipping.")

    # deal with subcommittees
    if parent_comm is None:
        # checking parent_comm so we don't look for subcommittees
        # in subcommittees leaving us exposed to infinity
        page = self.get(home_link).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(home_link)
        sub_links = page.xpath("//li/a[contains(@href, '/home.htm')]")
        for l in sub_links:
            if "committee" in l.text.lower():
                yield from self.scrape_committee(chamber, l, name)
def scrape_upper_committee(self, name, url):
    """Yield the upper-chamber committee at ``url`` with its members.

    Raises ``Exception`` if no members are found, since an empty result
    here indicates a broken selector rather than an empty committee.
    """
    page = lxml.html.fromstring(self.get(url).text)
    comm = Organization(name=name, chamber="upper", classification="committee")
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'biographies')]"):
        member = link.xpath("string()").strip()
        member = re.sub(r"\s+", " ", member)
        if not member:
            continue
        # Role text, when present, trails the anchor element; any tail text
        # that mentions neither chair title is kept verbatim.
        role = link.tail
        if not role:
            role = "member"
        elif "Vice Chair" in role:
            role = "vice chair"
        elif "Chair" in role:
            role = "chair"
        member = member.replace("Senator ", "")
        comm.add_member(member, role=role)

    if not comm._related:
        # Fix: the name was passed as a second Exception argument instead of
        # being %-interpolated, so the message never contained it.
        raise Exception("no members for %s" % comm.name)
    yield comm
def test_vote_event_bill_actions_two_stage():
    """Two votes referencing the same action must not conflict when they are
    imported in separate passes: the action stays pinned to the first."""
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    org1 = ScrapeOrganization(name="House", classification="lower")
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=org1._id)
    bill.add_action(description="passage", date="1900-04-02", chamber="lower")
    # Two otherwise-identical vote events that both reference the single
    # "passage" action above.
    ve1 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        organization=org1._id,
    )
    ve2 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        organization=org1._id,
    )
    # disambiguate them
    ve1.pupa_id = "one"
    ve2.pupa_id = "two"

    oi = OrganizationImporter("jid")
    oi.import_data([org1.as_dict()])
    bi = BillImporter("jid", oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data([ve1.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data([ve1.as_dict(),
                                                                     ve2.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
def test_vote_event_bill_actions():
    """Votes are matched to bill actions by action text, chamber, and date;
    a vote with no matching action gets ``bill_action is None``."""
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    org1 = ScrapeOrganization(name="House", classification="lower")
    org2 = ScrapeOrganization(name="Senate", classification="upper")
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=org1._id)

    # add actions, passage of upper & lower on same day, something else,
    # then passage in upper again on a different day
    bill.add_action(description="passage", date="1900-04-01", chamber="upper")
    bill.add_action(description="passage", date="1900-04-01", chamber="lower")
    bill.add_action(description="other event", date="1900-04-01", chamber="lower")
    bill.add_action(description="passage", date="1900-04-02", chamber="upper")

    # four passage votes, one per chamber, one on 04-01, and one on 04-02
    ve1 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        organization=org1._id,
    )
    ve2 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        organization=org2._id,
    )
    ve3 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        organization=org1._id,
    )
    ve4 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        organization=org2._id,
    )

    oi = OrganizationImporter("jid")
    oi.import_data([org1.as_dict(), org2.as_dict()])
    bi = BillImporter("jid", oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])
    VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data(
        [ve1.as_dict(), ve2.as_dict(), ve3.as_dict(), ve4.as_dict()])

    bill = Bill.objects.get()
    votes = list(VoteEvent.objects.all())
    actions = list(bill.actions.all())
    assert len(actions) == 4
    assert len(votes) == 4

    votes = {(v.organization.classification, v.start_date): v.bill_action
             for v in votes}

    # ensure that votes are matched using action, chamber, and date
    assert votes[("upper", "1900-04-01")] == actions[0]
    assert votes[("lower", "1900-04-01")] == actions[1]
    assert votes[("upper", "1900-04-02")] == actions[3]
    # the lower chamber never had a passage action on 04-02, so no match
    assert votes[("lower", "1900-04-02")] is None
def scrape_chamber(self, chamber):
    """Scrape all Texas committees (with membership) for one chamber.

    ``chamber`` must be "lower" or "upper"; it selects the listing URL and
    the legislator-title prefix to strip from member names.
    """
    committee_list_urls = {
        "lower": "https://capitol.texas.gov/Committees/"
        "CommitteesMbrs.aspx?Chamber=H",
        "upper": "https://capitol.texas.gov/Committees/"
        "CommitteesMbrs.aspx?Chamber=S",
    }

    committee_list_url = committee_list_urls[chamber]
    committee_list_page = self.lxmlize(committee_list_url)
    committee_nodes = self.get_nodes(
        committee_list_page, '//form[@id="ctl00"]//a[@id="CmteList"]')
    for committee_node in committee_nodes:
        committee_name = committee_node.text.strip()
        committee = Organization(name=committee_name, chamber=chamber,
                                 classification="committee")
        # Get the committee profile page.
        committee_page_url = committee_node.get("href")
        committee_page = self.lxmlize(committee_page_url)
        # Capture table with committee membership data.
        details_table = self.get_node(committee_page,
                                      '//div[@id="content"]//table[2]')
        if details_table is not None:
            # Skip the first row because it currently contains only headers
            detail_rows = self.get_nodes(details_table, "./tr")[1:]
            for detail_row in detail_rows:
                label_text = self.get_node(detail_row, "./td[1]//text()")
                if label_text:
                    label_text = label_text.strip().rstrip(":")
                # NOTE(review): both "Chair" and "Vice Chair" map to the
                # "chair" role here — confirm that is intended.
                if label_text in ("Chair", "Vice Chair"):
                    member_role = "chair"
                else:
                    member_role = "member"
                member_name_text = self.get_node(detail_row, "./td[2]/a/text()")
                # Clean titles from member names.
                if chamber == "upper":
                    member_name = re.sub(r"^Sen\.[\s]*", "", member_name_text)
                elif chamber == "lower":
                    member_name = re.sub(r"^Rep\.[\s]*", "", member_name_text)
                # Collapse multiple whitespaces in member names.
                member_name = re.sub(r"[\s]{2,}", " ", member_name).strip()
                committee.add_member(member_name, member_role)
        committee.add_source(committee_list_url)
        committee.add_source(committee_page_url)
        yield committee
def get_organizations(self):
    """Yield the jurisdiction's legislature and its two chambers."""
    parent = Organization("Congress", classification="legislature")
    yield parent
    # Fix: parent_id takes the organization's id, not the object itself,
    # matching how parent ids are passed everywhere else in this codebase.
    yield Organization("House", classification="lower", parent_id=parent._id)
    yield Organization("Senate", classification="upper", parent_id=parent._id)
def scrape_upper(self):
    """Scrape California Senate committees: standing, sub, joint, and other.

    Membership is read either from a "Senate Membership:" text blob or,
    failing that, from legislator profile links on the committee page.
    """
    # Retrieve index list of committees.
    url = "http://senate.ca.gov/committees"

    doc = self.lxmlize(url)

    standing_committees = doc.xpath(
        '//h2[text()="Standing Committees"]/../following-sibling::div//a')
    sub_committees = doc.xpath(
        '//h2[text()="Sub Committees"]/../following-sibling::div//a')
    joint_committees = doc.xpath(
        '//h2[text()="Joint Committees"]/../following-sibling::div//a')
    other_committees = doc.xpath(
        '//h2[text()="Other"]/../following-sibling::div//a')

    # Iterates over each committee [link] found.
    for committee in (standing_committees + sub_committees +
                      joint_committees + other_committees):
        # Get the text of the committee link, which should be the name of
        # the committee.
        (comm_name, ) = committee.xpath("text()")
        (comm_url, ) = committee.xpath("@href")
        comm_doc = self.lxmlize(comm_url)

        if comm_name.startswith("Joint"):
            org = Organization(chamber="legislature",
                               classification="committee",
                               name=comm_name)
        elif comm_name.startswith("Subcommittee"):
            # Subcommittees resolve their parent from the site banner and
            # their own name from the page title.
            (parent_name, ) = comm_doc.xpath(
                '//div[@class="banner-sitename"]/a/text()')
            (subcom_name, ) = comm_doc.xpath('//h1[@class="title"]/text()')
            org = Organization(
                name=subcom_name.strip(),
                classification="committee",
                parent_id={
                    "name": parent_name,
                    "classification": "upper"
                },
            )
        else:
            org = Organization(chamber="upper", name=comm_name,
                               classification="committee")

        org.add_source(comm_url)

        # Special case of members list being presented in text blob.
        member_blob = comm_doc.xpath(
            'string(//div[contains(@class, "field-item") and '
            'starts-with(text(), "Senate Membership:")][1]/text()[1])')

        if member_blob:
            # Separate senate membership from assembly membership.
            # This should strip the header from assembly membership
            # string automatically.
            delimiter = "Assembly Membership:\n"
            senate_members, delimiter, assembly_members = member_blob.partition(
                delimiter)

            # Strip header from senate membership string.
            senate_members = senate_members.replace(
                "Senate Membership:\n", "")

            # Clean membership strings.
            senate_members = senate_members.strip()
            assembly_members = assembly_members.strip()

            # Parse membership strings into lists.
            senate_members = senate_members.split("\n")
            assembly_members = assembly_members.split("\n")

            members = senate_members + assembly_members
        # Typical membership list format.
        else:
            members = comm_doc.xpath(
                '//a[(contains(@href, "/sd") or '
                'contains(@href, "assembly.ca.gov/a")) and '
                '(starts-with(text(), "Senator") or '
                'starts-with(text(), "Assembly Member"))]/text()')

        for member in members:
            if not member.strip():
                continue

            # Verbose regex: strip the legislator title, capture the name,
            # and optionally capture a parenthesized role, ignoring any
            # trailing "(R)"/"(D)" party marker.
            (mem_name, mem_role) = re.search(
                r"""(?ux)
                ^(?:Senator|Assembly\sMember)\s  # Legislator title
                (.+?)  # Capture the senator's full name
                (?:\s\((.{2,}?)\))?  # There may be role in parentheses
                (?:\s\([RD]\))?  # There may be a party affiliation
                \s*$
                """,
                member,
            ).groups()

            org.add_member(mem_name, role=mem_role if mem_role else "member")

        if not org._related:
            self.warning(
                "No members found for committee {}".format(comm_name))

        yield org
def scrape_chamber(self, chamber):
    """Scrape South Carolina legislators for one chamber, yielding each
    Person and (once each) every committee they belong to.

    Committee Organizations are yielded the first time they are seen;
    later members are attached to the cached instance.
    """
    if chamber == "lower":
        url = "http://www.scstatehouse.gov/member.php?chamber=H"
    else:
        url = "http://www.scstatehouse.gov/member.php?chamber=S"

    # Committees already yielded, keyed by name.
    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get("href")

        if full_name.startswith("Senator"):
            full_name = full_name.replace("Senator ", "")
        if full_name.startswith("Representative"):
            full_name = full_name.replace("Representative ", "")

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        if "Resigned effective" in leg_html:
            self.info("Resigned")
            continue

        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px;'
            ' margin: 0 0 0 0; padding: 0;"]/text()')

        if "Republican" in party:
            party = "Republican"
        elif "Democrat" in party:
            party = "Democratic"

        # District # - County - Map
        district = district.split()[1]
        try:
            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ""
        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        # capitol office address
        try:
            capitol_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[1]')
                [0]).decode()
            if capitol_address:
                capitol_address = parse_address(capitol_address)
                person.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol address for {0}".format(full_name))

        # capitol office phone
        try:
            capitol_phone = (
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[2]')
                [0].text_content().strip())

            label, number = parse_phone(capitol_phone)
            if number:
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol phone for {0}".format(full_name))

        # home address
        try:
            home_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Home Address"]/../p[1]')
                [0]).decode()
            if home_address:
                home_address = parse_address(home_address)
                person.add_contact_detail(type="address",
                                          value=home_address,
                                          note="District Office")
        except IndexError:
            self.warning("no home address for {0}".format(full_name))

        # home or business phone
        try:
            home_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[2]')
                [0].text_content().strip())

            label, number = parse_phone(home_phone)
            if number:
                label = ("Primary Office"
                         if label == "Business" else "District Office")
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note=label)
        except IndexError:
            self.warning(
                "no home or business phone for {0}".format(full_name))

        # business or home phone
        try:
            business_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[3]')
                [0].text_content().strip())

            label, number = parse_phone(business_phone)
            if number:
                label = ("Primary Office"
                         if label == "Business" else "District Office")
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note=label)
        except IndexError:
            pass

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(", "):
                committee, role = com.text_content().rsplit(", ", 1)

                # known roles
                role = {
                    "Treas.": "treasurer",
                    "Secy.": "secretary",
                    "Secy./Treas.": "secretary/treasurer",
                    "V.C.": "vice-chair",
                    "1st V.C.": "first vice-chair",
                    "Co 1st V.C.": "co-first vice-chair",
                    "2nd V.C.": "second vice-chair",
                    "3rd V.C.": "third vice-chair",
                    "Ex.Officio Member": "ex-officio member",
                    "Chairman": "chairman",
                }[role]
            else:
                committee = com.text
                role = "member"

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification="committee",
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def _scrape_lower_chamber(self, session):
    """Scrape Missouri lower-chamber committees for ``session``.

    Joint committees found in the hierarchy are reclassified under the
    "legislature" chamber.
    """
    self.info("Scraping lower chamber for committees.")

    chamber = "lower"

    url = "{base}CommitteeHierarchy.aspx".format(base=self._reps_url_base)
    page_string = self.get(url).text
    page = lxml.html.fromstring(page_string)
    # Last tr has the date
    committee_links = page.xpath("//li//a")
    for committee_link in committee_links:
        committee_name = committee_link.text_content().strip()
        committee_url = committee_link.attrib.get("href")
        committee_url = "{base}{members}{url}".format(
            base=self._reps_url_base,
            members="MemberGridCluster.aspx?filter=compage&category=committee&",
            url=committee_url,
        )
        actual_chamber = chamber
        if "joint" in committee_name.lower():
            actual_chamber = "legislature"

        # Strip committee-type qualifiers down to the bare name.
        committee_name = committee_name.replace("Committee On ", "")
        committee_name = committee_name.replace("Special", "")
        committee_name = committee_name.replace("Select", "")
        committee_name = committee_name.replace("Special", "")
        committee_name = committee_name.replace("Joint", "")
        committee_name = committee_name.replace(" Committee", "")
        committee_name = committee_name.strip()

        committee = Organization(committee_name,
                                 chamber=actual_chamber,
                                 classification="committee")

        committee_page_string = self.get(committee_url).text
        committee_page = lxml.html.fromstring(committee_page_string)
        # First tr has the title (sigh)
        mem_trs = committee_page.xpath(
            "//table[@id='gvMembers_DXMainTable']//tr[contains(@class, 'dxgvDataRow')]"
        )
        for mem_tr in mem_trs:
            mem_code = None
            mem_links = mem_tr.xpath("td/a[1]")
            mem_role_string = mem_tr.xpath(
                "td[4]")[0].text_content().strip()
            if len(mem_links):
                mem_code = mem_links[0].attrib.get("href")
            # Output is "Rubble, Barney, Neighbor"
            mem_parts = mem_tr.xpath(
                "td[2]")[0].text_content().strip().split(",")
            if self._no_members_text in mem_parts:
                continue
            mem_name = mem_parts[1].strip() + " " + mem_parts[0].strip()
            # Sometimes Senator abbreviation is in the name
            mem_name = mem_name.replace("Sen. ", "")
            mem_name = mem_name.replace("Rep. ", "")

            mem_role = "member"
            if len(mem_role_string) > 2:
                mem_role = mem_role_string.lower()
            membership = committee.add_member(mem_name, role=mem_role)
            # Keep the member's profile href for later cross-referencing.
            membership.extras = {"code": mem_code}
        committee.add_source(url)
        committee.add_source(committee_url)
        yield committee
def scrape(self):
    """Scrape Indiana committees (and subcommittees) from the IGA API.

    Parent committees are cached in ``self._parent_committees`` so that
    subcommittees encountered later can reference them.
    """
    session = self.latest_session()
    subcomms = self.get_subcommittee_info(session)

    api_base_url = "https://api.iga.in.gov"
    html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(
        session)
    client = ApiClient(self)
    r = client.get("committees", session=session)
    all_pages = client.unpaginate(r)
    for comm_info in all_pages:
        # this is kind of roundabout, but needed in order
        # to take advantage of all of our machinery to make
        # sure we're not overloading their api
        comm_link = comm_info["link"]
        comm_name = comm_link.split("/")[-1]
        if "withdrawn" in comm_name or "conference" in comm_name:
            continue
        try:
            comm_json = client.get("committee", committee_link=comm_link[1:])
        except HTTPError:
            self.logger.warning("Page does not exist")
            continue
        try:
            chamber = comm_json["chamber"]["name"]
        except KeyError:
            # No chamber on the record means it's a joint committee.
            chamber = "joint"
        else:
            if chamber == "Senate":
                chamber = "upper"
            elif chamber == "House":
                chamber = "lower"
            else:
                raise AssertionError(
                    "Unknown committee chamber {}".format(chamber))

        name = comm_json["name"]
        try:
            owning_comm = subcomms[name]
        except KeyError:
            # Not a known subcommittee: treat as a (potential) parent.
            name = name.replace("Statutory Committee on", "").strip()
            comm = Organization(name=name,
                                chamber=chamber,
                                classification="committee")
            if name in subcomms.values():
                # Avoid identification issues, if committee names are re-used
                # between upper and lower chambers
                assert self._parent_committees.get(name) is None
                self._parent_committees[name] = comm
        else:
            # Subcommittee: link it to the cached parent organization.
            name = (name.replace("Statutory Committee on",
                                 "").replace("Subcommittee", "").strip())
            comm = Organization(
                name=name,
                parent_id=self._parent_committees[owning_comm],
                classification="committee",
            )

        chair = self.process_special_members(comm, comm_json, "chair")
        vicechair = self.process_special_members(comm, comm_json, "viceChair")
        ranking = self.process_special_members(comm, comm_json,
                                               "rankingMinMember")

        # leadership is also listed in membership
        # so we have to make sure we haven't seen them yet
        comm_members = [m for m in [chair, vicechair, ranking] if m]

        for mem in comm_json["members"]:
            mem_name = mem["firstName"] + " " + mem["lastName"]
            if mem_name not in comm_members:
                comm_members.append(mem_name)
                comm.add_member(mem_name)

        api_source = api_base_url + comm_link

        if comm_name[:10] == "committee_":
            html_source = html_base_url + comm_name[10:]
            comm.add_source(html_source)
        comm.add_source(api_source)
        yield comm
def scrape_comm(self, url, chamber):
    """Scrape committees for one chamber from the JSON listing at ``url``."""
    data = self.post(url).json()["Data"]

    for item in data:
        comm_name = item["CommitteeName"]
        committee = Organization(name=comm_name,
                                 chamber=chamber,
                                 classification="committee")
        # str() normalizes JSON nulls to the literal string "None",
        # which the checks below rely on.
        chair_man = str(item["ChairName"])
        vice_chair = str(item["ViceChairName"])
        comm_id = item["CommitteeId"]
        comm_url = self.get_comm_url(chamber, comm_id, comm_name)
        members = self.scrape_member_info(comm_url)
        if vice_chair != "None":
            committee.add_member(vice_chair, role="Vice-Chair")
        if chair_man != "None":
            committee.add_member(chair_man, role="Chairman")

        for member in members:
            # vice_chair and chair_man already added.
            if chair_man not in member and vice_chair not in member:
                # Collapse internal whitespace before adding.
                member = " ".join(member.split())
                if member:
                    committee.add_member(member)

        committee.add_source(comm_url)
        committee.add_source(url)
        yield committee
def scrape_committees_pdf(self, year, chamber, filename, url):
    """Parse a Montana committee-roster PDF and yield committees with members."""
    if chamber == "lower" and year == "2015":
        # The 2015 House PDF is malformed and needs repair before extraction.
        text = self._fix_house_text(filename).decode()
    else:
        text = convert_pdf(filename, type="text-nolayout").decode()

    # Repair known committee names that PDF extraction split across lines.
    for hotgarbage, replacement in (
        (
            r"Judicial Branch, Law Enforcement,\s+and\s+Justice",
            "Judicial Branch, Law Enforcement, and Justice",
        ),
        (
            r"Natural Resources and\s+Transportation",
            "Natural Resources and Transportation",
        ),
        (
            r"(?u)Federal Relations, Energy,?\s+and\s+Telecommunications",
            "Federal Relations, Energy, and Telecommunications",
        ),
    ):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())

    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: "Agriculture" not in s, lines)

    comm = None
    for line in lines:
        # Replace Unicode variants with ASCII equivalents
        line = line.replace(" ", " ").replace("‐", "-")

        if "Subcommittees" in line:
            self.warning("Currently, we're skipping subcommittees")
            # https://github.com/openstates/openstates/issues/2099
            break

        if is_committee_name(line):
            # New committee starts: emit the previous one if it gained members.
            if comm and comm._related:
                yield comm
            committee = line.strip()
            comm = Organization(name=committee,
                                chamber=chamber,
                                classification="committee")
            comm.add_source(url)
        elif is_legislator_name(line):
            name, party = line.rsplit("(", 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            if re.search(" Ch", party):
                role = "chair"
            elif " VCh" in party:
                role = "vice chair"
            elif " MVCh" in party:
                role = "minority vice chair"
            else:
                role = "member"
            comm.add_member(name, role)

    # Emit the final committee.
    # NOTE(review): if no committee name was ever matched, ``comm`` is still
    # None here and this raises AttributeError — presumably the PDFs always
    # contain at least one committee; confirm.
    if comm._related:
        yield comm
def test_full_bill():
    """End-to-end bill import: every field set on the scrape object must
    round-trip through BillImporter into the database models."""
    create_jurisdiction()
    sp = ScrapePerson("Adam Smith")
    org = ScrapeOrganization(name="House", classification="lower")
    com = ScrapeOrganization(
        name="Arbitrary Committee", classification="committee", parent_id=org._id
    )

    # An older bill for the related-bill resolution check below.
    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        from_organization=org._id,
    )

    bill = ScrapeBill(
        "HB 1",
        "1900",
        "Axe & Tack Tax Act",
        classification="tax bill",
        from_organization=org._id,
    )
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee", "1900-04-04", chamber="lower")
    act.add_related_entity("arbitrary committee", "organization", com._id)
    bill.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session"
    )
    # One linked sponsor (entity_id set) and one unlinked lead sponsor.
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=sp._id,
    )
    bill.add_sponsorship(
        "Jane Smith", classification="lead sponsor", entity_type="person", primary=True
    )
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.pdf", media_type="application/pdf"
    )
    bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.html", media_type="text/html"
    )
    bill.add_version_link(
        "Fiscal Note", "http://example.com/v/1", media_type="text/html"
    )
    bill.add_source("http://example.com/source")

    # import bill
    oi = OrganizationImporter("jid")
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter("jid")
    pi.import_data([sp.as_dict()])

    BillImporter("jid", oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower"
    )
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get().organization == Organization.objects.get(
        classification="committee"
    )

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def __missing__(self, key):
    """Lazily create, cache, and return a committee Organization for ``key``."""
    org = Organization(chamber="legislature", name=key,
                       classification="committee")
    self[key] = org
    return org
def test_vote_event_bill_actions_errors():
    """Edge cases of vote→action matching: ambiguous matches, missing
    matches, and two votes competing for one action."""
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    org1 = ScrapeOrganization(name="House", classification="lower")
    org2 = ScrapeOrganization(name="Senate", classification="upper")
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=org1._id)

    # for this bill, two identical actions, so vote matching will fail
    bill.add_action(description="passage", date="1900-04-01", chamber="lower")
    bill.add_action(description="passage", date="1900-04-01", chamber="lower")
    # this action is good, but two votes will try to match it
    bill.add_action(description="passage", date="1900-04-02", chamber="lower")

    # will match two actions
    ve1 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        identifier="1",
        bill_action="passage",
        organization=org1._id,
    )
    # will match no actions
    ve2 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        identifier="2",
        bill_action="committee result",
        organization=org1._id,
    )
    # these two votes will both match the same action
    ve3 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        identifier="3",
        bill_action="passage",
        organization=org1._id,
    )
    ve4 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage-syz",
        start_date="1900-04-02",
        classification="passage:bill",
        result="fail",
        bill_chamber="lower",
        bill="HB 1",
        identifier="4",
        bill_action="passage",
        organization=org1._id,
    )

    oi = OrganizationImporter("jid")
    oi.import_data([org1.as_dict(), org2.as_dict()])
    bi = BillImporter("jid", oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])
    VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data(
        [ve1.as_dict(), ve2.as_dict(), ve3.as_dict(), ve4.as_dict()])

    bill = Bill.objects.get()
    votes = list(VoteEvent.objects.all().order_by("identifier"))

    # isn't matched, was ambiguous across two actions
    assert votes[0].bill_action is None
    # isn't matched, no match in actions
    assert votes[1].bill_action is None

    # these both try to match the same action, only first will succeed
    assert votes[2].bill_action is not None
    assert votes[3].bill_action is None
def scrape_current(self, chamber): if chamber == "upper": chambers = ["special_committees", "senate_committees"] else: chambers = ["house_committees"] committee_request = self.get(ksapi.url + "ctte/").text committee_json = json.loads(committee_request) for com_type in chambers: committees = committee_json["content"][com_type] for committee_data in committees: # set to joint if we are using the special_committees com_chamber = ("legislature" if com_type == "special_committees" else chamber) committee = Organization( committee_data["TITLE"], chamber=com_chamber, classification="committee", ) com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"] try: detail_json = self.get(com_url).text except scrapelib.HTTPError: self.warning("error fetching committee %s" % com_url) continue details = json.loads(detail_json)["content"] for chair in details["CHAIR"]: if chair.get("FULLNAME", None): chair_name = chair["FULLNAME"] else: chair_name = self.parse_kpid(chair["KPID"]) self.warning("no FULLNAME for %s", chair["KPID"]) committee.add_member(chair_name, "chairman") for vicechair in details["VICECHAIR"]: committee.add_member(vicechair["FULLNAME"], "vice-chairman") for rankedmember in details["RMMEM"]: committee.add_member(rankedmember["FULLNAME"], "ranking member") for member in details["MEMBERS"]: committee.add_member(member["FULLNAME"]) if not committee._related: self.warning("skipping blank committee %s" % committee_data["TITLE"]) else: committee.add_source(com_url) yield committee
def test_basic_invalid_organization(): orga = Organization("name") # no source with pytest.raises(ScrapeValueError): orga.validate()
def _scrape_upper_chamber(self, session):
    """Yield Missouri Senate committees for *session*.

    Post-2015 sessions live on a WordPress-style site; older sessions use
    the legacy static pages, so URLs and xpaths differ by era.
    """
    self.info("Scraping upper chamber for committees.")

    chamber = "upper"

    if self._is_post_2015 and self.latest_session() != session:
        url = "{base}{year}web/standing-committees".format(
            base=self._senate_url_base, year=session[2:]
        )
        comm_container_id = "primary"
    elif session == self.latest_session():
        url = "{base}standing-committees".format(base=self._senate_url_base)
        comm_container_id = "primary"
    else:
        url = "{base}{year}info/com-standing.htm".format(
            base=self._senate_url_base, year=session[2:]
        )
        comm_container_id = "mainContent"

    page = self.lxmlize(url)
    comm_links = self.get_nodes(
        page, '//div[@id = "{}"]//p/a'.format(comm_container_id)
    )

    for comm_link in comm_links:
        # Normalize to uppercase - varies between "Assigned bills" and "Assigned Bills"
        if "ASSIGNED BILLS" in comm_link.text_content().upper():
            continue

        comm_link = comm_link.attrib["href"]

        # Only follow links that actually point at committee pages.
        if self._is_post_2015:
            if "web" not in comm_link:
                continue
        else:
            if "comm" not in comm_link:
                continue

        comm_page = self.lxmlize(comm_link)

        if self._is_post_2015:
            comm_name = self.get_node(comm_page, '//h1[@class="entry-title"]/text()')
            members = self.get_nodes(
                comm_page, '//div[@id="bwg_standart_thumbnails_0"]/a'
            )
        else:
            comm_name = self.get_node(comm_page, '//div[@id="mainContent"]/p/text()')
            members = self.get_nodes(comm_page, '//div[@id="mainContent"]//td/a')

        comm_name = comm_name.replace(" Committee", "")
        comm_name = comm_name.strip()

        committee = Organization(comm_name, chamber=chamber, classification="committee")

        for member in members:
            mem_link = member.attrib.get("href", "")
            if "mem" not in mem_link:
                continue

            # BUGFIX: a prior lookup of .//span[@class="bwg_title_spun2_0"]
            # was immediately overwritten by the next line; the dead
            # get_node() call has been removed.
            mem_parts = member.text_content().strip().split(",")

            # Senator title stripping mainly for post-2015.
            mem_name = re.sub(r"^Senator[\s]+", "", mem_parts[0])

            # this one time, MO forgot the comma between
            # the member and his district. Very rarely relevant.
            try:
                int(mem_name[-4:-2])  # the district's # is in this position
            except ValueError:
                pass
            else:
                mem_name = " ".join(mem_name.split(" ")[0:-1])  # member name fixed

            # ok, so this next line. We don't care about
            # the first 2 elements of mem_parts anymore
            # so whatever. But if the member has a role, we want
            # to make sure there are 3 elements in mem_parts and
            # the last one is actually the role. This sucks, sorry.
            mem_parts.append(mem_parts[-1])

            mem_role = "member"
            if len(mem_parts) > 2:
                mem_role = mem_parts[2].lower().split(" ")[0].strip()

            if mem_name == "":
                continue

            committee.add_member(mem_name, role=mem_role)

    committee.add_source(url)
    committee.add_source(comm_link)

    yield committee
def test_no_source_on_party_org(): org = Organization("Hat", classification="party") # no source? no problem because classification = party org.validate()
def scrape(self):
    """Scrape Connecticut legislators from the official CSV database.

    Yields committee Organization objects the first time each committee
    is seen, followed by the Person object for each legislator.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {"H": "lower", "S": "upper"}[row["office code"]]

        district = row["dist"].lstrip("0")
        assert district.isdigit(), "Invalid district found: {}".format(district)

        name = row["first name"]
        mid = row["middle initial"].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row["last name"]
        suffix = row["suffix"].strip()
        if suffix:
            name += " %s" % suffix

        party = row["party"]
        if party == "Democrat":
            party = "Democratic"

        leg = Person(primary_org=chamber, name=name, district=district, party=party)

        legislator_url = row["URL"].replace("\\", "//").strip()
        if legislator_url != "":
            if not legislator_url.startswith("http"):
                # BUGFIX: previously the URL was *replaced* by the bare
                # scheme ("http://"); prepend it to the value instead.
                legislator_url = "http://" + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row["capitol street address"],
            row["room number"],
        )
        # extra_office_fields = dict()
        email = row["email"].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith("http://") or email.startswith("https://"):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError("Problematic email found: {}".format(email))
        leg.add_contact_detail(type="address", value=office_address, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=row["capitol phone"], note="Capitol Office")
        if email:
            leg.add_contact_detail(type="email", value=email)

        home_address = "{}\n{}, {} {}".format(
            row["home street address"],
            row["home city"],
            row["home state"],
            row["home zip code"],
        )
        # skip the placeholder address used when no home address is listed
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type="address", value=home_address, note="District Office")
            if row["home phone"].strip():
                leg.add_contact_detail(type="voice", value=row["home phone"], note="District Office")
        leg.add_source(leg_url)

        for comm_name in row["committee member1"].split(";"):
            # committee entries look like "Name (role)"; default to member
            if " (" in comm_name:
                comm_name, role = comm_name.split(" (")
                role = role.strip(")").lower()
            else:
                role = "member"
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name, classification="committee", chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com

                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape_committee(self, chamber, name, url, subcommittee=None): name = self._fix_committee_name(name) name = self._fix_committee_case(name) page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) # Get the subcommittee name. xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()' if subcommittee: subcommittee = page.xpath(xpath) if subcommittee: subcommittee = page.xpath(xpath).pop(0) subcommittee = self._fix_committee_name( subcommittee, parent=name, subcommittee=True ) subcommittee = self._fix_committee_case(subcommittee) else: subcommittee = None # Dedupe. if (chamber, name, subcommittee) in self._seen: return self._seen.add((chamber, name, subcommittee)) comm = Organization(chamber=chamber, name=name, classification="committee") comm.add_source(url) member_nodes = page.xpath('//table[@class="dxgvTable"]/tr') for member_node in member_nodes: # Skip empty rows. if member_node.attrib["class"] == "dxgvEmptyDataRow": continue mtype = member_node.xpath("string(td[1])").strip() if not mtype: mtype = "member" member = member_node.xpath("string(td[3])").split() member = " ".join(member[1:]) comm.add_member(member, role=mtype) for a in page.xpath( '//table[@id="ctl00_m_g_a194465c_f092_46df_b753_' '354150ac7dbd_ctl00_tblContainer"]//ul/li/a' ): sub_name = a.text.strip() sub_url = a.get("href").replace("../", "/") self.scrape_committee(chamber, name, sub_url, subcommittee=sub_name) if not comm._related: if subcommittee: self.warning("Not saving empty subcommittee {}.".format(subcommittee)) else: self.warning("Not saving empty committee {}.".format(name)) else: yield comm
def scrape_lower(self):
    """Yield Assembly standing/select committees and their subcommittees."""
    url = self.urls["lower"]
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.base_urls["lower"])

    for type_ in ["Standing", "Select"]:
        # NOTE(review): "Joint" never appears in the list above, so this
        # branch is currently unreachable; kept in case joint committees
        # are re-added to the list.
        if type_ == "Joint":
            _chamber = type_.lower()
        else:
            _chamber = "lower"

        # The site markup has varied; try known container xpaths in order
        # and use the first one that matches.
        for xpath in [
            '//div[contains(@class, "view-view-%sCommittee")]' % type_,
            '//div[contains(@id, "block-views-view_StandingCommittee-block_1")]',
            '//div[contains(@class, "views-field-title")]',
        ]:
            div = doc.xpath(xpath)
            if div:
                break
        div = div[0]

        committees = div.xpath('descendant::span[@class="field-content"]/a/text()')
        committees = map(strip, committees)
        urls = div.xpath('descendant::span[@class="field-content"]/a/@href')

        for c, _url in zip(committees, urls):
            c = c.replace("Committee on ", "").replace(" Committee", "")
            org = Organization(name=c, chamber=_chamber, classification="committee")
            self.info(u"Saving {} committee.".format(c))
            org.add_source(_url)
            org.add_source(url)
            for member, role in self.scrape_lower_members(_url):
                org.add_member(member, role)

            _found = False
            if not org._related:
                # Fall back to the "/membersstaff" sub-page when the main
                # page listed no members.
                try:
                    for member, role in self.scrape_lower_members(_url + "/membersstaff"):
                        _found = True
                        org.add_member(member, role)
                    if _found:
                        source = _url + "/membersstaff"
                        org.add_source(source)
                except requests.exceptions.HTTPError:
                    self.error("Unable to access member list for {} "
                               "committee.".format(c))

            if org._related:
                yield org
            else:
                self.warning("No members found for {} committee.".format(c))

    # Subcommittees
    div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
    for subcom in div.xpath('div/div[@class="item-list"]'):
        committee = self.get_node(subcom, "h4/text()")
        if committee is None:
            continue
        names = subcom.xpath("descendant::a/text()")
        names = map(strip, names)
        urls = subcom.xpath("descendant::a/@href")

        for n, _url in zip(names, urls):
            n = re.search(r"^Subcommittee.*?on (.*)$", n).group(1)
            # NOTE(review): the parent/parent_id values here predate this
            # change and look suspect ("lower" as a classification); verify
            # against the Organization model before altering.
            org = Organization(
                name=n,
                parent="lower",
                classification="committee",
                parent_id={
                    "name": committee,
                    "classification": "lower",
                },
            )
            org.add_source(_url)
            org.add_source(url)

            for member, role in self.scrape_lower_members(_url):
                org.add_member(member, role)

            _found = False
            if not org._related:
                try:
                    for member, role in self.scrape_lower_members(_url + "/membersstaff"):
                        _found = True
                        org.add_member(member, role)
                    if _found:
                        source = _url + "/membersstaff"
                        org.add_source(source)
                except requests.exceptions.HTTPError:
                    self.error(
                        "Unable to access member list for {} subcommittee."
                        .format(org.name))

            if org._related:
                yield org
            else:
                # BUGFIX: the second format argument was org._related (a
                # list of memberships); report the parent committee name.
                self.warning("No members found for {} subcommittee of {} "
                             "committee".format(org.name, committee))
def scrape_joint_committee(self, committee_name, url): if "state.tn.us" in url: com = Organization(committee_name, chamber="legislature", classification="committee") try: page = self.get(url).text except requests.exceptions.ConnectionError: self.logger.warning("Committee link is broken, skipping") return page = lxml.html.fromstring(page) for el in page.xpath( "//div[@class='Blurb']/table//tr[2 <= position() and position() < 10]/td[1]" ): if el.xpath("text()") == ["Vacant"]: continue (member_name, ) = el.xpath("a/text()") if el.xpath("text()"): role = el.xpath("text()")[0].strip(" ,") else: role = "member" member_name = member_name.replace("Senator", "") member_name = member_name.replace("Representative", "") member_name = member_name.strip() com.add_member(member_name, role) com.add_link(url) com.add_source(url) return com elif "gov-opps" in url: com = Organization(committee_name, chamber="legislature", classification="committee") page = self.get(url).text page = lxml.html.fromstring(page) links = ["senate", "house"] for link in links: chamber_link = self.base_href + "/" + link + "/committees/gov-opps.html" chamber_page = self.get(chamber_link).text chamber_page = lxml.html.fromstring(chamber_page) OFFICER_SEARCH = ( '//h2[contains(text(), "Committee Officers")]/' "following-sibling::div/ul/li/a") MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/' "following-sibling::div/ul/li/a") for a in chamber_page.xpath( OFFICER_SEARCH) + chamber_page.xpath(MEMBER_SEARCH): member_name = " ".join( [x.strip() for x in a.xpath(".//text()") if x.strip()]) role = a.xpath("small") if role: role = role[0].xpath("text()")[0].strip() member_name = member_name.replace(role, "").strip() else: role = "member" com.add_member(member_name, role) com.add_source(chamber_link) com.add_link(url) com.add_source(url) return com else: return self._scrape_committee(committee_name, url, "legislature")
def get_organizations(self): yield Organization("Unicameral Legislature", classification="legislature")
def test_full_organization(): create_jurisdictions() org = ScrapeOrganization("United Nations", classification="international") org.add_identifier("un") org.add_name("UN", start_date="1945") org.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") org.add_link("http://example.com/link") org.add_source("http://example.com/source") # import org od = org.as_dict() OrganizationImporter("jid1").import_data([od]) # get person from db and assert it imported correctly o = Organization.objects.get() assert "ocd-organization" in o.id assert o.name == org.name assert o.identifiers.all()[0].identifier == "un" assert o.identifiers.all()[0].scheme == "" assert o.other_names.all()[0].name == "UN" assert o.other_names.all()[0].start_date == "1945" assert o.contact_details.all()[0].type == "phone" assert o.contact_details.all()[0].value == "555-555-1234" assert o.contact_details.all()[0].note == "this is fake" assert o.links.all()[0].url == "http://example.com/link" assert o.sources.all()[0].url == "http://example.com/source"