def test_full_organization():
    """Round-trip a fully populated organization through the importer.

    Builds a scraped organization carrying an identifier, an alternate
    name, a contact detail, a link and a source, imports it, then checks
    that the single stored Organization exposes the same values.
    """
    create_jurisdictions()

    scraped = ScrapeOrganization("United Nations", classification="international")
    scraped.add_identifier("un")
    scraped.add_name("UN", start_date="1945")
    scraped.add_contact_detail(
        type="phone", value="555-555-1234", note="this is fake"
    )
    scraped.add_link("http://example.com/link")
    scraped.add_source("http://example.com/source")

    # Import the serialized organization.
    OrganizationImporter("jid1").import_data([scraped.as_dict()])

    # Fetch the organization back from the database and verify each field.
    stored = Organization.objects.get()
    assert "ocd-organization" in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == "un"
    assert identifier.scheme == ""

    other_name = stored.other_names.all()[0]
    assert other_name.name == "UN"
    assert other_name.start_date == "1945"

    contact = stored.contact_details.all()[0]
    assert contact.type == "phone"
    assert contact.value == "555-555-1234"
    assert contact.note == "this is fake"

    assert stored.links.all()[0].url == "http://example.com/link"
    assert stored.sources.all()[0].url == "http://example.com/source"
def scrape_committees(self, chamber):
    """Yield one committee Organization per row of the chamber's listing.

    The listing page is fetched from ``_COMMITTEE_URL`` formatted with
    ``_CHAMBERS[chamber]``. The first matched row is skipped; every
    following row produces an ``Organization`` with an email and a voice
    contact detail, the page URL as its source, and its members.

    Args:
        chamber: Key into ``_CHAMBERS``; also passed to ``Organization``.

    Yields:
        Organization: one per committee row.

    Raises:
        IndexError: if a row has fewer than five usable text fragments
            (the contact details are read by fixed position).
    """
    url = _COMMITTEE_URL % _CHAMBERS[chamber]
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    table = html.xpath(
        "body/section[2]/div/div/section[2]/div[2]/div/div/div/div")

    for row in table[1:]:
        # Per the original author: committee name, description, hours of
        # operation, secretary and office phone live in the first cell.
        text = list(row[0].xpath("div")[0].itertext())

        attributes = []
        for value in text:
            if "Email:" in value or "Phone:" in value or value == "\n":
                continue
            cleaned = value.replace("\xa0", " ").replace("Secretary:", "")
            # Drop non-ASCII characters but stay in ``str``: calling
            # ``str()`` on bytes would produce the "b'...'" repr on
            # Python 3 and corrupt the value.
            cleaned = cleaned.encode("ascii", "ignore").decode("ascii")
            if "Room" in cleaned:
                # Keep only the part before the room number.
                cleaned = cleaned.split("Room")[0].replace(", ", " ")
            attributes.append(cleaned)

        org = Organization(
            chamber=chamber,
            classification="committee",
            name=attributes[0],
        )

        # Rows with more than five fragments carry one extra fragment
        # before the contact fields, shifting them by one position.
        if len(attributes) > 5:
            email, phone = attributes[4], attributes[5]
        else:
            email, phone = attributes[3], attributes[4]
        org.add_contact_detail(
            type="email",
            value=email,
            note="District Office",
        )
        org.add_contact_detail(
            type="voice",
            value=phone,
            note="District Office",
        )

        org.add_source(url)

        # membership
        td_text = []
        for td in row[1].xpath("div") + row[2].xpath("div"):
            td_text.extend(td.itertext())
        members = [
            value for value in td_text if value not in (" ", "\n", ",")
        ]

        # A "Chair" / "Vice Chair" label applies to the next name that
        # follows it; everyone else is a plain member.
        role = "member"
        for member in members:
            if member in ("Chair", "Vice Chair"):
                role = member.lower()
                continue
            name = member.strip()
            if name:
                org.add_member(name, role=role)
                role = "member"

        yield org