def test_full_person():
    """Import a fully populated scraped person and check each stored field.

    Builds a ``ScrapePerson`` carrying an identifier, an alternate name, a
    contact detail, a link and a source, runs it through ``PersonImporter``
    and asserts that the resulting ``Person`` row and its related rows hold
    the same values.
    """
    scraped = ScrapePerson("Tom Sawyer")
    scraped.add_identifier("1")
    scraped.add_name("Tommy", start_date="1880")
    scraped.add_contact_detail(
        type="phone", value="555-555-1234", note="this is fake"
    )
    scraped.add_link("http://example.com/link")
    scraped.add_source("http://example.com/source")

    # Serialize first, then hand the payload to the importer.
    payload = scraped.as_dict()
    importer = PersonImporter("jid")
    importer.import_data([payload])

    # Fetch the stored person and compare it against what was scraped.
    stored = Person.objects.get()
    assert "ocd-person" in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == "1"
    assert identifier.scheme == ""

    other_name = stored.other_names.all()[0]
    assert other_name.name == "Tommy"
    assert other_name.start_date == "1880"

    contact = stored.contact_details.all()[0]
    assert contact.type == "phone"
    assert contact.value == "555-555-1234"
    assert contact.note == "this is fake"

    link = stored.links.all()[0]
    assert link.url == "http://example.com/link"

    source = stored.sources.all()[0]
    assert source.url == "http://example.com/source"
def test_deduplication_other_name_overlaps():
    """Importing a person whose other_name overlaps an existing name adds no row.

    After ``create_person()`` has set up the existing record, a scraped
    "The Rock" with the alternate name "Dwayne Johnson" is imported and the
    total number of ``Person`` rows must still be one.
    """
    create_jurisdiction()
    create_person()

    # The scraped record's other_name overlaps with an existing name.
    scraped = ScrapePerson("The Rock")
    scraped.add_name("Dwayne Johnson")

    payload = scraped.as_dict()
    importer = PersonImporter("jid")
    importer.import_data([payload])

    person_total = Person.objects.all().count()
    assert person_total == 1
def test_same_name_people_other_name():
    """Two distinct people sharing a name via other_names raise SameNameError.

    One scraped person is named "Dwayne Johnson" outright; the other is
    named "Rock" but lists "Dwayne Johnson" as an alternate name.  Importing
    both in one batch must be rejected.
    """
    create_jurisdiction()
    # The name-collision code has to take other_names into account.
    Organization.objects.create(name="WWE", jurisdiction_id="jid")

    by_primary_name = ScrapePerson("Dwayne Johnson", image="http://example.com/1")
    by_other_name = ScrapePerson("Rock", image="http://example.com/2")
    by_other_name.add_name("Dwayne Johnson")

    # Same name, yet different images: apparently two different people.
    with pytest.raises(SameNameError):
        importer = PersonImporter("jid")
        importer.import_data(
            [by_primary_name.as_dict(), by_other_name.as_dict()]
        )
# Example no. 4
# 0
    def scrape_member(self, chamber, member_url):
        """Scrape one legislator's detail page and yield a ``Person``.

        Fetches ``member_url``, reads title, name and party from the page
        heading, and collects the photo, district, district-office contact
        details and (when listed) occupation.

        Args:
            chamber: Chamber to file the member under.  Replaced by "lower"
                or "upper" when the heading title starts with
                "Representative" or "Senator" respectively, which also
                covers the "-Elect" titles of an incoming class.
            member_url: URL of the member's detail page.  The text between
                its first "=" and the following "&" (with "+" turned into
                spaces) is recorded as an alternate name.

        Yields:
            The populated ``Person``.

        Raises:
            AssertionError: If the last heading token is not a known party
                code, unless the title contains "-Elect" and the token does
                not start with "(".
            IndexError: If the heading is empty or the district element is
                not found on the page.
        """
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)

        # Heading tokens are laid out as: <title> <name ...> <(party)>
        name_and_party = root.xpath('string(//div[@class="col-md-12"]/h1[1])').split()

        title = name_and_party[0]
        # Account for Representative-Elect and Senator-Elect, for incoming class
        if title.startswith("Representative"):
            chamber = "lower"
        elif title.startswith("Senator"):
            chamber = "upper"

        full_name = " ".join(name_and_party[1:-1])

        party_names = {
            "(R)": "Republican",
            "(D)": "Democratic",
            "(G)": "Green",
            "(I)": "Independent",
        }
        party_code = name_and_party[-1]
        if party_code in party_names:
            party = party_names[party_code]
        elif "-Elect" in title and not party_code.startswith("("):
            # No party marker yet, so the last token belongs to the name.
            self.warning("Member-elect is currently missing a party")
            full_name = " ".join(name_and_party[1:])
            party = ""
        else:
            raise AssertionError(
                "Unknown party ({0}) for {1}".format(party_code, full_name)
            )

        photos = root.xpath('//img[@class="SitePhotos MemberPhoto"]')
        if photos:
            photo_url = "https://www.arkleg.state.ar.us" + photos[0].attrib["src"]
        else:
            self.warning("No member photo found")
            photo_url = ""

        district = root.xpath(
            "(//b[text()='District:'])[2]/parent::div/parent::div/div[3]"
        )[0].text_content()

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        person.add_link(member_url)
        person.add_source(member_url)

        # The alternate name comes from the URL: the segment after the first
        # "=" up to the next "&".  A URL without "=" used to abort the whole
        # member with an IndexError; the alternate name is optional, so warn
        # and carry on instead.
        url_segments = member_url.split("=")
        if len(url_segments) > 1:
            vote_name = url_segments[1].split("&")[0].replace("+", " ")
            person.add_name(vote_name)
        else:
            self.warning("No alternate name found in member URL")

        # XPath string() always yields a string, so a plain emptiness check
        # is enough; the value is stored exactly as found on the page.
        phone = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[1]/div[3])'
        )
        if not phone.strip():
            phone = None

        email = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[2]/div[3])'
        )
        if not email.strip():
            email = None

        address = root.xpath('string(//div[@id="bodyContent"]/div[1]/div[1]/p/b)')
        if address.strip():
            # Insert "AR " ahead of the last five characters.
            # NOTE(review): presumably the state abbreviation placed before a
            # 5-digit ZIP code; assumes no trailing whitespace - confirm.
            split_at = len(address) - 5
            address = address[:split_at] + "AR " + address[split_at:]
            person.add_contact_detail(
                type="address", value=address, note="District Office"
            )
        else:
            # Previously an empty address was recorded as the bogus "AR ".
            self.warning("No district office address found")

        if phone is not None:
            person.add_contact_detail(type="voice", value=phone, note="District Office")
        if email is not None:
            person.add_contact_detail(type="email", value=email, note="District Office")

        # The fifth row is only read as the occupation when it is labelled so.
        occupation_check = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[1]/b)'
        )
        if occupation_check == "Occupation:":
            occupation = root.xpath(
                'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[3])'
            )
            # Store only a non-blank value; the old code assigned first and
            # checked afterwards, so blank occupations were kept.
            if occupation.strip():
                person.extras["occupation"] = occupation

        yield person