def test_full_person():
    """Import a scraped person carrying every optional detail and verify it.

    Builds a ``ScrapePerson`` with an identifier, an alternate name, a
    contact detail, a link and a source, runs it through ``PersonImporter``,
    then checks each piece on the stored ``Person``.
    """
    scraped = ScrapePerson("Tom Sawyer")
    scraped.add_identifier("1")
    scraped.add_name("Tommy", start_date="1880")
    scraped.add_contact_detail(
        type="phone", value="555-555-1234", note="this is fake"
    )
    scraped.add_link("http://example.com/link")
    scraped.add_source("http://example.com/source")

    # Run the serialized person through the importer.
    PersonImporter("jid").import_data([scraped.as_dict()])

    # Fetch the imported person back from the database.
    stored = Person.objects.get()
    assert "ocd-person" in stored.id
    assert stored.name == scraped.name

    # No scheme was passed to add_identifier, so it is expected to be blank.
    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == "1"
    assert identifier.scheme == ""

    other_name = stored.other_names.all()[0]
    assert other_name.name == "Tommy"
    assert other_name.start_date == "1880"

    contact = stored.contact_details.all()[0]
    assert contact.type == "phone"
    assert contact.value == "555-555-1234"
    assert contact.note == "this is fake"

    assert stored.links.all()[0].url == "http://example.com/link"
    assert stored.sources.all()[0].url == "http://example.com/source"
def test_deduplication_other_name_overlaps():
    """Importing a person whose other_name overlaps must not add a row.

    After ``create_person()`` seeds the database, a scraped "The Rock" with
    the alternate name "Dwayne Johnson" is imported, and the ``Person``
    count is expected to stay at one.
    """
    create_jurisdiction()
    create_person()

    # Person has other_name that overlaps w/ existing name
    # (presumably the one created by create_person() -- defined elsewhere).
    scraped = ScrapePerson("The Rock")
    scraped.add_name("Dwayne Johnson")

    importer = PersonImporter("jid")
    importer.import_data([scraped.as_dict()])

    assert Person.objects.all().count() == 1
def test_same_name_people_other_name():
    """Name-collision detection must take other_names into account.

    Two scraped people with different images are imported in one batch: one
    is named "Dwayne Johnson", the other is named "Rock" but lists
    "Dwayne Johnson" as an alternate name. The import is expected to raise
    ``SameNameError``.
    """
    create_jurisdiction()
    Organization.objects.create(name="WWE", jurisdiction_id="jid")

    first = ScrapePerson("Dwayne Johnson", image="http://example.com/1")
    second = ScrapePerson("Rock", image="http://example.com/2")
    # The collision only exists through this alternate name.
    second.add_name("Dwayne Johnson")

    # the people have the same name but are apparently different
    with pytest.raises(SameNameError):
        importer = PersonImporter("jid")
        importer.import_data([first.as_dict(), second.as_dict()])
def scrape_member(self, chamber, member_url):
    """Scrape a single member profile page and yield one ``Person``.

    Args:
        chamber: Chamber to assign to the member. It is overridden with
            "lower" or "upper" when the page heading starts with
            "Representative" or "Senator" respectively.
        member_url: URL of the member's profile page. The value of its
            first query parameter is also recorded as an alternate name.

    Yields:
        A ``Person`` with link, source, alternate name, any contact details
        found (address, voice, email) and, when present and non-blank, an
        ``occupation`` entry in ``extras``.

    Raises:
        AssertionError: If the trailing party marker is not recognised and
            the member is not a party-less member-elect.
        IndexError: If the heading is empty, the district node is missing,
            or ``member_url`` contains no ``=``.
    """
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)

    # Heading tokens: title first, party marker last, name in between.
    name_and_party = root.xpath('string(//div[@class="col-md-12"]/h1[1])').split()
    title = name_and_party[0]

    # Account for Representative-Elect and Senator-Elect, for incoming class
    if title.startswith("Representative"):
        chamber = "lower"
    elif title.startswith("Senator"):
        chamber = "upper"

    full_name = " ".join(name_and_party[1:-1])
    party = name_and_party[-1]
    party_names = {
        "(R)": "Republican",
        "(D)": "Democratic",
        "(G)": "Green",
        "(I)": "Independent",
    }
    if party in party_names:
        party = party_names[party]
    elif "-Elect" in title and not party.startswith("("):
        # No party marker at all: the last token is part of the name.
        self.warning("Member-elect is currently missing a party")
        full_name = " ".join(name_and_party[1:])
        party = ""
    else:
        raise AssertionError("Unknown party ({0}) for {1}".format(party, full_name))

    photos = root.xpath('//img[@class="SitePhotos MemberPhoto"]')
    if photos:
        photo_url = "https://www.arkleg.state.ar.us" + photos[0].attrib["src"]
    else:
        self.warning("No member photo found")
        photo_url = ""

    district = root.xpath(
        "(//b[text()='District:'])[2]/parent::div/parent::div/div[3]"
    )[0].text_content()

    person = Person(
        name=full_name,
        district=district,
        party=party,
        primary_org=chamber,
        image=photo_url,
    )

    person.add_link(member_url)
    person.add_source(member_url)

    # Alternate name: the value of the URL's first query parameter, with "+"
    # turned back into spaces.
    vote_name = member_url.split("=")[1].split("&")[0].replace("+", " ")
    person.add_name(vote_name)

    # XPath string() yields "" when the node is missing, so a blank check is
    # all that is needed; the raw (unstripped) text is kept when present.
    phone = root.xpath(
        'string(//div[@id="bodyContent"]/div[2]/div[2]/div[1]/div[3])'
    )
    if not phone.strip():
        phone = None

    email = root.xpath(
        'string(//div[@id="bodyContent"]/div[2]/div[2]/div[2]/div[3])'
    )
    if not email.strip():
        email = None

    address = root.xpath('string(//div[@id="bodyContent"]/div[1]/div[1]/p/b)')
    if address.strip():
        # Insert the state abbreviation ahead of the last five characters
        # (presumably a 5-digit ZIP code -- confirm against the live page).
        # The offset is clamped so very short text gets a prefix instead of
        # a mid-string insert.
        split_at = max(len(address) - 5, 0)
        address = address[:split_at] + "AR " + address[split_at:]
        person.add_contact_detail(
            type="address", value=address, note="District Office"
        )
    else:
        # Avoid storing a bare "AR " as the address when none was found.
        self.warning("No district office address found")

    if phone is not None:
        person.add_contact_detail(type="voice", value=phone, note="District Office")
    if email is not None:
        person.add_contact_detail(type="email", value=email, note="District Office")

    occupation_label = root.xpath(
        'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[1]/b)'
    )
    if occupation_label == "Occupation:":
        occupation = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[3])'
        )
        # Only record the occupation when it has content, so a blank value
        # never ends up in extras.
        if occupation.strip():
            person.extras["occupation"] = occupation

    yield person