def test_bill_sponsor_by_identifier():
    """A sponsorship added by identifier should resolve to the matching person."""
    create_jurisdiction()
    org = create_org()

    scraped_bill = ScrapeBill(
        "HB 1", "1900", "Axe & Tack Tax Act", classification="tax bill", chamber="lower"
    )
    scraped_bill.add_sponsorship_by_identifier(
        name="SNODGRASS",
        classification="sponsor",
        entity_type="person",
        primary=True,
        identifier="TOTALLY_REAL_ID",
        scheme="TOTALLY_REAL_SCHEME",
    )

    org_importer = OrganizationImporter("jid")
    person_importer = PersonImporter("jid")

    # import a person carrying that same identifier, then bind them to the org
    snodgrass = ScrapePerson(name="Zadock Snodgrass")
    snodgrass.add_identifier(identifier="TOTALLY_REAL_ID", scheme="TOTALLY_REAL_SCHEME")
    person_importer.import_data([snodgrass.as_dict()])
    saved_person = Person.objects.get()
    Membership.objects.create(person_id=saved_person.id, organization_id=org.id)

    BillImporter("jid", org_importer, person_importer).import_data(
        [scraped_bill.as_dict()]
    )

    imported_bill = Bill.objects.get()
    (sponsorship,) = imported_bill.sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
Exemplo n.º 2
0
    def scrape_member_page(self, chamber, url):
        """Scrape an Ohio House member-list page and yield a ``Person`` per member.

        :param chamber: chamber name; selects the email pattern ("lower" ->
            @ohiohouse.gov, otherwise @ohiosenate.gov) and the primary org.
        :param url: the member-list page to fetch.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # each member is rendered inside a div carrying the memberModule class
        for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"
        ):
            img = legislator.xpath(".//div[@class='thumbnail']//img")[0].attrib["src"]
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()

            # vacant seats are listed but carry no usable member data
            if "Vacant" in full_name:
                continue

            homepage = homepage.attrib["href"]
            party = data.xpath(".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
            # the bare text nodes of the data div are the office address lines;
            # the last one is the phone number
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)

            # a leadership title, when present, precedes the address lines
            if re.search(r"(Leader|Whip|Speaker)", office_lines[0]):
                office_lines.pop(0)

            office = "\n".join(office_lines)
            h3 = data.xpath("./h3")
            if len(h3):
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District", "").strip()
            else:
                # fall back to the district number embedded in the background
                # image filename (e.g. ".../42.png")
                district = re.findall(r"\d+\.png", legislator.attrib["style"])[
                    -1
                ].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", full_name).strip()
            # official addresses are built from the zero-padded district number
            email = (
                "rep{0:0{width}}@ohiohouse.gov"
                if chamber == "lower"
                else "sd{0:0{width}}@ohiosenate.gov"
            ).format(int(district), width=2)

            leg = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=img,
            )

            leg.add_contact_detail(type="address", value=office, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_contact_detail(type="email", value=email, note="Capitol Office")

            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
def test_no_membership_for_person_including_party():
    """
    even though party is specified we should still get a no memberships error because it doesn't
    bind the person to a jurisdiction, thus causing duplication
    """
    create_jurisdiction()
    Organization.objects.create(
        id="fnd",
        name="Foundation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )
    Organization.objects.create(id="dem", name="Democratic", classification="party")

    # a party membership alone does not tie the person to a jurisdiction
    democrat = ScrapePerson("a man without a country", party="Democratic")
    person_imp = PersonImporter("fnd-jid")
    org_imp = OrganizationImporter("fnd-jid")
    person_imp.import_data([democrat.as_dict()])

    # importing only the party membership should still be rejected
    membership_imp = MembershipImporter(
        "fnd-jid", person_imp, org_imp, DumbMockImporter()
    )

    with pytest.raises(NoMembershipsError):
        membership_imp.import_data([democrat._related[0].as_dict()])
Exemplo n.º 4
0
    def scrape_lower(self, chamber):
        """Scrape the Michigan House roster table and yield a ``Person`` per row.

        :param chamber: unused here — every row is imported with
            ``primary_org="lower"``.
        """
        url = "http://www.house.mi.gov/mhrpublic/frmRepList.aspx"
        # column order of the roster table, used to label each cell below
        table = ["website", "district", "name", "party", "location", "phone", "email"]

        data = self.get(url).text
        doc = lxml.html.fromstring(data)

        # skip two rows at top
        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            tds = row.xpath(".//td")
            # header/spacer rows have no td cells
            if len(tds) == 0:
                continue
            # map each cell element to its column name
            metainf = {}
            for i in range(0, len(table)):
                metainf[table[i]] = tds[i]
            district = str(int(metainf["district"].text_content().strip()))
            party = metainf["party"].text_content().strip()
            phone = metainf["phone"].text_content().strip()
            email = metainf["email"].text_content().strip()
            name = metainf["name"].text_content().strip()
            # vacant seats show up as "Vacant" or a bare "District NNN" label
            if name == "Vacant" or re.match(r"^District \d{1,3}$", name):
                self.warning(
                    "District {} appears vacant, and will be skipped".format(district)
                )
                continue
            leg_url = metainf["website"].xpath("./a")[0].attrib["href"]

            # expand the site's building abbreviations into full addresses
            office = metainf["location"].text_content().strip()
            office = re.sub(
                " HOB",
                " Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933",
                office,
            )
            office = re.sub(" CB", " State Capitol Building\nLansing, MI 48909", office)

            # photo is best-effort: missing pages or empty results leave it blank
            try:
                photo_url = self.get_photo_url(leg_url)[0]
            except (scrapelib.HTTPError, IndexError):
                photo_url = ""
                self.warning("no photo url for %s", name)

            person = Person(
                name=name,
                district=district,
                party=abbr[party],
                primary_org="lower",
                image=photo_url,
            )

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(
                type="address", value=office, note="Capitol Office"
            )
            person.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            person.add_contact_detail(type="email", value=email, note="Capitol Office")

            yield person
def test_deduplication_no_name_overlap():
    """A scraped person whose name matches nothing existing is imported as new."""
    create_jurisdiction()
    create_person()
    # same org, completely different name -> no dedupe, two people total
    newcomer = ScrapePerson("CM Punk")
    PersonImporter("jid").import_data([newcomer.as_dict()])
    assert Person.objects.all().count() == 2
def test_person_add_party():
    """add_party should attach a membership pointing at a party pseudo-id."""
    groot = Person("Groot")
    groot.add_party("Green")
    membership = groot._related[0]
    membership.validate()
    expected = {"name": "Green", "classification": "party"}
    assert get_pseudo_id(membership.organization_id) == expected
def test_deduplication_other_name_exists():
    """A name already stored in an existing person's other_names deduplicates."""
    create_jurisdiction()
    create_person()
    # "Rocky" is saved among the existing person's other_names
    scraped = ScrapePerson("Rocky")
    PersonImporter("jid").import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
def test_deduplication_same_name():
    """The simplest dedupe case: identical name, same jurisdiction."""
    create_jurisdiction()
    create_person()
    scraped = ScrapePerson("Dwayne Johnson")
    PersonImporter("jid").import_data([scraped.as_dict()])
    # merged into the single existing record
    assert Person.objects.all().count() == 1
def test_deduplication_no_jurisdiction_overlap():
    """The same name imported under a different jurisdiction is a new person."""
    create_jurisdiction()
    create_person()
    scraped = ScrapePerson("Dwayne Johnson")
    PersonImporter("new-jurisdiction-id").import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 2
Exemplo n.º 10
0
def test_invalid_fields_related_item():
    """An unknown field on a related item should raise DataImportError."""
    scraped = ScrapePerson("Dwayne")
    scraped.add_link("http://example.com")
    payload = scraped.as_dict()
    payload["links"][0]["test"] = 3  # not a valid link field

    with pytest.raises(DataImportError):
        PersonImporter("jid").import_data([payload])
Exemplo n.º 11
0
    def scrape_rep(self, url):
        """Scrape one representative's detail page and yield a ``Person``.

        Yields nothing (returns early) when the member has resigned or is
        deceased, since the page then carries no usable data.

        :param url: the member's detail page URL.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        main = page.xpath('//div[@id="main-info"]')[0]
        if "Resigned" in main.text_content():
            print("Member resigned {}".format(url))
            # PEP 479: raising StopIteration inside a generator is converted
            # to RuntimeError on Python 3.7+; a plain return ends it cleanly.
            return  # don't yield anything

        if "Deceased" in main.text_content():
            print("Member is deceased {}".format(url))
            return  # don't yield anything

        name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
        name = re.sub(r"\s+", " ", name)
        district_number = page.xpath(
            '//span[contains(text(), "House District:")]'
            "/following-sibling::span/text()")[0].strip()
        # remove anything after first whitespace
        district_number = re.sub(r"\s.*", "", district_number.strip())

        # email is optional; the envelope-icon link holds it when present
        email = None
        email_content = page.xpath(
            '//a[./i[contains(@class,"fa-envelope")]]/text()')
        if email_content and email_content[0].strip():
            email = email_content[0].strip()

        photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

        party = self.get_rep_table_by_header(page,
                                             "Party Affiliation").text.strip()
        party = _party_map[party[0]]  # standardize

        # the first non-blank paragraph of main-info is the district address
        main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
        address = [t.strip() for t in main_p_text if t.strip()][0]

        person = Person(
            name=name,
            district=district_number,
            primary_org="lower",
            party=party,
            image=photo_url,
        )

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="District Office")
        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Office")

        person.add_link(url)
        person.add_source(url)

        yield person
Exemplo n.º 12
0
    def scrape_senator_page(self, chamber, url):
        """Scrape the Ohio Senate roster page and yield a ``Person`` per senator.

        Follows each senator's homepage to pick up phone, address, and party.

        :param chamber: chamber name; selects the email pattern and primary org.
        :param url: the roster page to fetch.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # each senator is a portraitContainer div inside the #senators section
        for legislator in page.xpath(
            "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' portraitContainer ')]"
        ):
            # the thumbnail URL is embedded in an inline style: url(...)
            img = legislator.xpath(
                ".//div[@class='profileThumbnailBoundingBox']/@style"
            )[0]
            img = img[img.find("(") + 1 : img.find(")")]
            full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[
                0
            ].attrib["href"]
            # district text looks like "District #NN" — keep what follows the '#'
            district = legislator.xpath(".//div[@class='profileDistrict']" "/a/text()")[
                0
            ].split("#")[1]

            # vacant seats carry no usable member data
            if "Vacant" in full_name:
                continue

            # phone and address live on the senator's own homepage
            homepage = self.get(homepage_url).text
            page = lxml.html.fromstring(homepage)
            phone = page.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = page.xpath("//div[@class='address']/descendant::*/text()")
            address = "\n".join(address_lines)

            # party is inferred from the party-logo image filename
            # NOTE(review): `party` stays unbound (NameError) if the image
            # matches neither "Republican" nor "Democrat" — confirm the site
            # never serves another value.
            party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
            if "Republican" in party_image:
                party = "Republican"
            elif "Democrat" in party_image:
                party = "Democratic"

            # official addresses are built from the zero-padded district number
            email = (
                "rep{0:0{width}}@ohiohouse.gov"
                if chamber == "lower"
                else "sd{0:0{width}}@ohiosenate.gov"
            ).format(int(district), width=2)

            leg = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                image=img,
                party=party,
            )

            leg.add_contact_detail(type="address", value=address, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_contact_detail(type="email", value=email, note="Capitol Office")

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
def test_person_add_term():
    """add_term should create a dated membership keyed on classification."""
    eternal = Person("Eternal")
    eternal.add_term("eternal", "council", start_date="0001", end_date="9999")
    membership = eternal._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {"classification": "council"}
    assert membership.start_date == "0001"
    assert membership.end_date == "9999"
def test_deduplication_other_name_overlaps():
    """An incoming other_name that matches a stored name should deduplicate."""
    create_jurisdiction()
    create_person()
    scraped = ScrapePerson("The Rock")
    scraped.add_name("Dwayne Johnson")  # overlaps the existing person's name
    PersonImporter("jid").import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
Exemplo n.º 15
0
def test_full_vote_event():
    """End-to-end import of a vote event: counts, individual votes, bill link."""
    juris = create_jurisdiction()
    juris.legislative_sessions.create(name="1900", identifier="1900")
    yes_voter = ScrapePerson("John Smith", primary_org="lower")
    no_voter = ScrapePerson("Adam Smith", primary_org="lower")
    house = ScrapeOrganization(name="House", classification="lower")
    bill = ScrapeBill(
        "HB 1", "1900", "Axe & Tack Tax Act", from_organization=house._id
    )
    vote_event = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        organization=house._id,
    )
    vote_event.set_count("yes", 20)
    vote_event.yes("John Smith")
    vote_event.no("Adam Smith")

    # import prerequisites: org, people, memberships, then the bill
    org_imp = OrganizationImporter("jid")
    org_imp.import_data([house.as_dict()])

    person_imp = PersonImporter("jid")
    person_imp.import_data([yes_voter.as_dict(), no_voter.as_dict()])

    membership_imp = MembershipImporter("jid", person_imp, org_imp, DumbMockImporter())
    membership_imp.import_data(
        [yes_voter._related[0].as_dict(), no_voter._related[0].as_dict()]
    )

    bill_imp = BillImporter("jid", org_imp, person_imp)
    bill_imp.import_data([bill.as_dict()])

    VoteEventImporter("jid", person_imp, org_imp, bill_imp).import_data(
        [vote_event.as_dict()]
    )

    assert VoteEvent.objects.count() == 1
    imported = VoteEvent.objects.get()
    assert imported.legislative_session == LegislativeSession.objects.get()
    assert imported.motion_classification == ["passage:bill"]
    assert imported.bill == Bill.objects.get()
    count = imported.counts.get()
    assert count.option == "yes"
    assert count.value == 20
    all_votes = list(imported.votes.all())
    assert len(all_votes) == 2
    # each voter record resolves to the right person and option
    for vote in all_votes:
        if vote.voter_name == "John Smith":
            assert vote.option == "yes"
            assert vote.voter == Person.objects.get(name="John Smith")
        else:
            assert vote.option == "no"
            assert vote.voter == Person.objects.get(name="Adam Smith")
def test_legislator_related_party():
    """A party kwarg on Person yields exactly one party membership on pre_save."""
    adams = Person("John Adams", party="Democratic-Republican")
    adams.pre_save("jurisdiction-id")

    # a party membership
    assert len(adams._related) == 1
    membership = adams._related[0]
    assert membership.person_id == adams._id
    assert get_pseudo_id(membership.organization_id) == {
        "classification": "party",
        "name": "Democratic-Republican",
    }
    assert membership.role == "member"
def test_person_add_membership_name():
    """add_membership by organization name links via a name pseudo-id."""
    actor = Person("Leonardo DiCaprio")
    actor.add_membership(
        "Academy of Motion Picture Arts and Sciences", role="winner", start_date="2016"
    )
    membership = actor._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {
        "name": "Academy of Motion Picture Arts and Sciences"
    }
    assert membership.person_id == actor._id
    assert membership.role == "winner"
    assert membership.start_date == "2016"
def test_basic_invalid_person():
    """A person must carry a name; clearing it should fail validation."""
    bob = Person("Bob B. Johnson")
    bob.add_source(url="http://example.com")
    bob.validate()  # valid while the name is present

    bob.name = None
    with pytest.raises(ScrapeValueError):
        bob.validate()
def test_legislator_related_district():
    """district + primary_org should yield one membership with a post pseudo-id."""
    adams = Person("John Adams", district="1", primary_org="legislature")
    adams.pre_save("jurisdiction-id")

    assert len(adams._related) == 1
    membership = adams._related[0]
    assert membership.person_id == adams._id
    assert get_pseudo_id(membership.organization_id) == {
        "classification": "legislature"
    }
    assert get_pseudo_id(membership.post_id) == {
        "organization__classification": "legislature",
        "label": "1",
    }
Exemplo n.º 20
0
def test_save_object_basics():
    """save_object should JSON-dump the person under a type-and-id filename."""
    scraper = Scraper(juris, "/tmp/")
    jordan = Person("Michael Jordan")
    jordan.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(jordan)

    # the filename is derived from the object's type and its id
    expected_name = "person_" + jordan._id + ".json"
    assert expected_name in scraper.output_names["person"]
    json_dump.assert_called_once_with(jordan.as_dict(), mock.ANY, cls=mock.ANY)
Exemplo n.º 21
0
    def handle_list_item(self, item):
        """Build a senator ``Person`` from one roster link element.

        Returns ``None`` for vacant seats; otherwise returns the Person after
        the detail page has been scraped onto it.

        :param item: the anchor element for one senator in the roster table.
        """
        # the link's text nodes together form the displayed name
        name = " ".join(item.xpath(".//text()"))
        name = re.sub(r"\s+", " ", name).replace(" ,", ",").strip()

        if "Vacant" in name:
            return

        # district and party live in sibling cells of the link's table row
        district = item.xpath("string(../../td[1])")
        party = item.xpath("string(../../td[2])")
        # normalize the site's label to the canonical party name
        if party == "Democrat":
            party = "Democratic"

        leg_url = item.get("href")

        name = fix_name(name)
        leg = Person(
            name=name,
            district=district,
            party=party,
            primary_org="upper",
            role="Senator",
        )
        leg.add_link(leg_url)
        leg.add_source(self.url)
        leg.add_source(leg_url)

        # enrich the Person in place from the senator's detail page
        self.scrape_page(SenDetail, leg_url, obj=leg)

        return leg
def test_multiple_orgs_of_same_class():
    """
    We should be able to set memberships on organizations with the
    same classification within the same jurisdictions
    """
    create_jurisdiction()
    # two organizations sharing one classification in one jurisdiction
    Organization.objects.create(
        id="fnd",
        name="Foundation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )
    Organization.objects.create(
        id="fdr",
        name="Federation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )

    hari = ScrapePerson(
        "Hari Seldon",
        primary_org="foundation",
        role="founder",
        primary_org_name="Foundation",
    )
    picard = ScrapePerson(
        "Jean Luc Picard",
        primary_org="foundation",
        role="founder",
        primary_org_name="Federation",
    )

    person_imp = PersonImporter("fnd-jid")
    person_imp.import_data([hari.as_dict()])
    person_imp.import_data([picard.as_dict()])

    # membership import must disambiguate the orgs by primary_org_name
    org_imp = OrganizationImporter("fnd-jid")
    dumb_imp = DumbMockImporter()
    membership_imp = MembershipImporter("fnd-jid", person_imp, org_imp, dumb_imp)

    membership_imp.import_data(
        [hari._related[0].as_dict(), picard._related[0].as_dict()]
    )

    hari_org = Person.objects.get(name="Hari Seldon").memberships.get().organization
    assert hari_org.name == "Foundation"
    picard_org = (
        Person.objects.get(name="Jean Luc Picard").memberships.get().organization
    )
    assert picard_org.name == "Federation"
Exemplo n.º 23
0
def test_save_related():
    """Objects appended to _related should be dumped alongside the parent."""
    scraper = Scraper(juris, "/tmp/")
    jordan = Person("Michael Jordan")
    jordan.add_source("http://example.com")
    bulls = Organization("Chicago Bulls", classification="committee")
    bulls.add_source("http://example.com")
    jordan._related.append(bulls)

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(jordan)

    # parent first, then the related organization
    assert json_dump.mock_calls == [
        mock.call(jordan.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(bulls.as_dict(), mock.ANY, cls=mock.ANY),
    ]
Exemplo n.º 24
0
def test_save_object_invalid():
    """An object that fails validation (here: no source) must not be saved."""
    scraper = Scraper(juris, "/tmp/")
    sourceless = Person("Michael Jordan")

    with pytest.raises(ValueError):
        scraper.save_object(sourceless)
def test_multiple_memberships():
    """Dedupe must survive a person with several memberships in one jurisdiction."""
    create_jurisdiction()
    # regression test: two or more memberships to the same jurisdiction
    # used to trigger an ORM error during import
    existing = Person.objects.create(name="Dwayne Johnson")
    for org_name in ("WWE", "WWF"):
        org = Organization.objects.create(name=org_name, jurisdiction_id="jid")
        Membership.objects.create(person=existing, organization=org)

    scraped = ScrapePerson("Dwayne Johnson")
    PersonImporter("jid").import_data([scraped.as_dict()])

    # deduplication should still collapse to the one existing record
    assert Person.objects.all().count() == 1
def test_committee_add_member_person():
    """add_member with a Person should record both ids and the role."""
    committee = Organization("Defense", classification="committee")
    adams = Person("John Adams")
    committee.add_member(adams, role="chairman")
    membership = committee._related[0]
    assert membership.person_id == adams._id
    assert membership.organization_id == committee._id
    assert membership.role == "chairman"
def test_legislator_related_chamber_district_role():
    """A role kwarg propagates into both the post pseudo-id and the membership."""
    speaker = Person(
        "John Adams", district="1", primary_org="lower", role="Speaker"
    )
    speaker.pre_save("jurisdiction-id")

    assert len(speaker._related) == 1
    membership = speaker._related[0]
    assert membership.person_id == speaker._id
    assert get_pseudo_id(membership.organization_id) == {"classification": "lower"}
    assert get_pseudo_id(membership.post_id) == {
        "organization__classification": "lower",
        "label": "1",
        "role": "Speaker",
    }
    assert membership.role == "Speaker"
def test_no_membership_for_person():
    """A person imported with zero memberships should raise NoMembershipsError."""
    create_jurisdiction()
    Organization.objects.create(
        id="fnd",
        name="Foundation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )

    # import a person who belongs to nothing
    membershipless = ScrapePerson("a man without a country")
    person_imp = PersonImporter("fnd-jid")
    person_imp.import_data([membershipless.as_dict()])

    # finishing the membership import should flag the unbound person
    dumb_imp = DumbMockImporter()
    membership_imp = MembershipImporter("fnd-jid", person_imp, dumb_imp, dumb_imp)

    with pytest.raises(NoMembershipsError):
        membership_imp.import_data([])
Exemplo n.º 29
0
    def handle_list_item(self, item):
        """Build a ``Person`` from one member entry of the MN House roster.

        The entry's heading text looks like "Name (NNA, PARTY)"; photo, address,
        phone, and email come from sibling elements of the entry.

        :param item: the list element for one representative.
        """
        photo_url = item.xpath("./img/@src")[0]
        url = item.xpath(".//h5/a/@href")[0]
        name_text = item.xpath(".//h5/a/b/text()")[0]

        # e.g. "Jane Doe (05B, DFL)" -> name / district / party abbreviation
        name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip("0").upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        # non-blank text nodes: two address lines followed by the phone number
        info_texts = [
            x.strip() for x in item.xpath("./div/text()[normalize-space()]")
            if x.strip()
        ]
        address = "\n".join((info_texts[0], info_texts[1]))

        # initialize to None so a failed validation skips the contact detail
        # instead of raising NameError (these were previously only bound
        # inside the if-branches)
        phone = None
        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email = None
        email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
        if validate_email_address(email_text):
            email = email_text

        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=photo_url,
        )
        rep.add_link(url)
        rep.add_contact_detail(type="address", value=address, note="capitol")
        if phone:
            rep.add_contact_detail(type="voice", value=phone, note="capitol")
        if email:
            rep.add_contact_detail(type="email", value=email, note="capitol")
        rep.add_source(self.url)

        yield rep
Exemplo n.º 30
0
    def scrape_member(self, chamber, link):
        """Build a ``Person`` from one row-link of the Iowa member list and yield it.

        :param chamber: chamber name used as the member's primary org.
        :param link: the anchor element for the member; district and party come
            from sibling table cells, further details from the member's page.
        """
        name = link.text.strip()
        leg_url = link.get("href")
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")

        # we get email on the next page now
        # email = link.xpath("string(../../td[5])")

        # normalize the site's party labels to canonical names
        if party == "Democrat":
            party = "Democratic"
        elif party == "No Party Specified":
            party = "Independent"

        # the photo service is keyed on general assembly number + person id
        pid = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" %
                     (self.latest_session(), pid))

        leg = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url,
        )

        leg.add_link(leg_url)
        leg.add_source(leg_url)

        # the detail page carries contact info; scrape it onto the Person
        leg_page = lxml.html.fromstring(self.get(link.attrib["href"]).text)
        self.scrape_member_page(leg, leg_page)
        yield leg