def test_full_organization():
    """Round-trip a scraped organization through the importer and verify all fields."""
    create_jurisdictions()

    # Build a scrape-side organization with one of each related record.
    org = ScrapeOrganization("United Nations", classification="international")
    org.add_identifier("un")
    org.add_name("UN", start_date="1945")
    org.add_contact_detail(type="phone", value="555-555-1234", note="this is fake")
    org.add_link("http://example.com/link")
    org.add_source("http://example.com/source")

    # import org
    OrganizationImporter("jid1").import_data([org.as_dict()])

    # get person from db and assert it imported correctly
    imported = Organization.objects.get()
    assert "ocd-organization" in imported.id
    assert imported.name == org.name

    identifier = imported.identifiers.all()[0]
    assert identifier.identifier == "un"
    assert identifier.scheme == ""

    other_name = imported.other_names.all()[0]
    assert other_name.name == "UN"
    assert other_name.start_date == "1945"

    contact = imported.contact_details.all()[0]
    assert contact.type == "phone"
    assert contact.value == "555-555-1234"
    assert contact.note == "this is fake"

    assert imported.links.all()[0].url == "http://example.com/link"
    assert imported.sources.all()[0].url == "http://example.com/source"
def scrape(self):
    """Scrape DC Council committee pages and yield one Organization per committee.

    Collects committee links from the index page, then visits each
    committee page for its summary, chair, and member list.
    """
    com_url = "http://dccouncil.us/committees"
    data = self.get(com_url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(com_url)

    # set() de-duplicates repeated anchors pointing at the same committee.
    comms = set(doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

    for link in comms:  # renamed from `committee`: was shadowed by the Organization below
        url = link.attrib["href"]
        name = link.text_content().strip()

        comm_data = self.get(url).text
        comm_page = lxml.html.fromstring(comm_data)
        comm_page.make_links_absolute(url)

        # classify these as belonging to the legislature
        committee = Organization(
            name=name, classification="committee", chamber="legislature"
        )

        # Optional free-text summary paragraph; query once instead of twice.
        summaries = comm_page.xpath('//p[@class="page-summary"]')
        if summaries:
            committee.extras["summary"] = summaries[0].text_content().strip()

        # Some pages lack a Chairperson heading; previously chair[0] raised
        # IndexError and aborted the whole scrape. Warn and continue instead.
        chair = comm_page.xpath("//h4[text()='Chairperson']/following-sibling::p")
        chair_name = None
        if chair:
            chair_name = self.remove_title(chair[0].text_content().strip())
            committee.add_member(chair_name, role="chair")
        else:
            self.warning("no chairperson found for %s", name)

        members = comm_page.xpath(
            "//h4[text()='Councilmembers']/following-sibling::ul"
        )
        for m in members[0].xpath("./li"):
            mem_name = self.remove_title(m.text_content().strip())
            # The chair is usually repeated in the member list; skip the duplicate.
            if mem_name != chair_name:
                committee.add_member(mem_name)

        committee.add_source(url)
        committee.add_link(url, note="Official Website")

        if not committee._related:
            self.warning("empty committee: %s;", name)
        else:
            yield committee
def _scrape_committee(self, committee_name, link, chamber):
    """Scrape an individual committee page and add its members."""
    page = lxml.html.fromstring(self.get(link).text)
    page.make_links_absolute(link)

    # Subcommittee pages carry a breadcrumb link back to "Committee".
    if page.xpath('//li/a[text()="Committee"]'):
        # All TN subcommittees are just the name of the parent committee with
        # " Subcommittee" at the end.
        parent_name = re.sub(r"\s*(Study )?Subcommittee\s*", "", committee_name)
        com = Organization(
            committee_name,
            classification="committee",
            parent_id=self.parents[parent_name],
        )
    else:
        com = Organization(committee_name, chamber=chamber, classification="committee")

    # Record this committee's id so its subcommittees can reference it.
    self.parents[committee_name] = com._id

    officer_xpath = (
        '//h2[contains(text(), "Committee Officers")]/'
        "following-sibling::div/ul/li/a"
    )
    member_xpath = (
        '//h2[contains(text(), "Committee Members")]/'
        "following-sibling::div/ul/li/a"
    )

    for anchor in page.xpath(officer_xpath) + page.xpath(member_xpath):
        # Name text may be split between the anchor and a nested <span>.
        pieces = anchor.xpath("text()") + anchor.xpath("span/text()")
        member_name = " ".join(piece.strip() for piece in pieces if piece.strip())

        small = anchor.xpath("small")
        role = small[0].xpath("text()")[0].strip() if small else "member"

        # Vacant seats are listed with a "(Vacant)" role; skip them.
        if "(Vacant)" in role:
            continue

        com.add_member(member_name, role)

    com.add_link(link)
    com.add_source(link)
    return com
def scrape(self, chamber=None):
    """Scrape Utah legislative committees from the state's JSON feeds.

    Joins the committee feed (which carries only legislator IDs) against
    the legislator feed to resolve member names, then yields one
    Organization per committee.
    """
    committees_url = "http://le.utah.gov/data/committees.json"
    committees = self.get(committees_url).json()["committees"]

    people_url = "http://le.utah.gov/data/legislators.json"
    people = self.get(people_url).json()["legislators"]

    # The committee JSON only has legislator IDs, not names
    ids_to_names = {person["id"]: person["formatName"] for person in people}

    for committee in committees:
        name = committee["description"]

        # Strip the generic suffix; the classification already says "committee".
        if name.endswith(" Committee"):
            name = name[: -len(" Committee")]
        elif name.endswith(" Subcommittee"):
            name = name[: -len(" Subcommittee")]

        # Chamber is encoded as a prefix on the committee name, if at all.
        if name.startswith("House "):
            name = name[len("House "):]
            chamber = "lower"
        elif name.startswith("Senate "):
            name = name[len("Senate "):]
            chamber = "upper"
        else:
            chamber = "legislature"

        c = Organization(chamber=chamber, name=name, classification="committee")
        c.add_source(committees_url)
        c.add_source(people_url)
        c.add_link(committee["link"])

        for member in committee["members"]:
            try:
                member_name = ids_to_names[member["id"]]
            except KeyError:
                # str() guards against non-string IDs in the feed.
                self.warning(
                    "Found unknown legislator ID in committee JSON: "
                    + str(member["id"])
                )
                # BUG FIX: previously fell through to add_member, raising
                # NameError on the first member or silently re-adding the
                # previous member under this member's position. Skip instead.
                continue
            c.add_member(member_name, role=member["position"])

        yield c
def scrape_joint_committee(self, committee_name, url):
    """Scrape a TN joint committee, dispatching on which site hosts its page.

    Returns an Organization, or None when the legacy page link is broken.
    """
    if "state.tn.us" in url:
        # Legacy state.tn.us layout: members listed in a table inside div.Blurb.
        com = Organization(committee_name, chamber="legislature", classification="committee")
        try:
            page = self.get(url).text
        except requests.exceptions.ConnectionError:
            # Broken link: give up on this committee rather than abort the run.
            self.logger.warning("Committee link is broken, skipping")
            return
        page = lxml.html.fromstring(page)
        # Rows 2-9 of the member table; the first cell holds the name link
        # plus optional bare text giving the member's role.
        for el in page.xpath(
            "//div[@class='Blurb']/table//tr[2 <= position() and position() < 10]/td[1]"
        ):
            # A cell whose only text is "Vacant" marks an empty seat.
            if el.xpath("text()") == ["Vacant"]:
                continue
            (member_name, ) = el.xpath("a/text()")
            # Text outside the <a> (e.g. ", Chair") is the role, comma-trimmed.
            if el.xpath("text()"):
                role = el.xpath("text()")[0].strip(" ,")
            else:
                role = "member"
            # Drop chamber titles so only the bare name is stored.
            member_name = member_name.replace("Senator", "")
            member_name = member_name.replace("Representative", "")
            member_name = member_name.strip()
            com.add_member(member_name, role)
        com.add_link(url)
        com.add_source(url)
        return com
    elif "gov-opps" in url:
        # Government Operations: membership is split across per-chamber pages.
        com = Organization(committee_name, chamber="legislature", classification="committee")
        # NOTE(review): this parsed page is never used below — the per-chamber
        # pages are fetched instead; looks like dead code, confirm before removing.
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        links = ["senate", "house"]
        for link in links:
            chamber_link = self.base_href + "/" + link + "/committees/gov-opps.html"
            chamber_page = self.get(chamber_link).text
            chamber_page = lxml.html.fromstring(chamber_page)
            OFFICER_SEARCH = (
                '//h2[contains(text(), "Committee Officers")]/'
                "following-sibling::div/ul/li/a")
            MEMBER_SEARCH = (
                '//h2[contains(text(), "Committee Members")]/'
                "following-sibling::div/ul/li/a")
            for a in chamber_page.xpath(
                    OFFICER_SEARCH) + chamber_page.xpath(MEMBER_SEARCH):
                # Name text may be spread over nested elements; join all pieces.
                member_name = " ".join(
                    [x.strip() for x in a.xpath(".//text()") if x.strip()])
                role = a.xpath("small")
                if role:
                    # Role appears in <small> and is also embedded in the
                    # joined name text, so strip it back out of the name.
                    role = role[0].xpath("text()")[0].strip()
                    member_name = member_name.replace(role, "").strip()
                else:
                    role = "member"
                com.add_member(member_name, role)
            # One source per chamber page actually scraped.
            com.add_source(chamber_link)
        com.add_link(url)
        com.add_source(url)
        return com
    else:
        # Everything else uses the standard committee-page scraper.
        return self._scrape_committee(committee_name, url, "legislature")