Пример #1
0
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Build a Person from one roster table row and return it with its profile link.

    Args:
        table_row_element: lxml element for a table row holding exactly six
            direct ``td`` children, in order: role, name, district, party,
            phone, email.
        chamber: passed through unchanged as the Person's ``primary_org``.

    Returns:
        A ``(legislator, profile_url)`` tuple, where ``profile_url`` is the
        ``href`` of the single ``a`` element directly inside the name cell.

    Raises:
        ValueError: if the row does not have exactly six cells, or the name
            cell does not contain exactly one direct link.
    """
    # Tuple unpacking doubles as a layout check: any other cell count raises.
    (role_element, name_element, district_element, party_element,
     phone_element, email_element) = table_row_element.xpath('td')

    # The name cell's text is used verbatim; no "Last, First" reordering is done.
    full_name = name_element.text_content().strip()
    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    if party == 'Democrat':
        party = 'Democratic'

    role = role_element.text_content().strip()
    # NOTE(review): co_address_from_role is defined elsewhere; presumably it maps
    # the role text to a Capitol office address - confirm.
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    (profile_url, ) = name_element.xpath('a/@href')

    legislator = Person(primary_org=chamber,
                        name=full_name,
                        district=district,
                        party=party)
    legislator.add_contact_detail(type='address', value=address, note='Capitol Office')
    legislator.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    legislator.add_contact_detail(type='email', value=email, note='Capitol Office')

    return legislator, profile_url
Пример #2
0
def test_deduplication_no_jurisdiction_overlap():
    """Importing 'Dwayne Johnson' under 'new-jurisdiction-id' must leave two Person rows.

    ``create_person()`` is defined elsewhere; it is presumably what stores the
    first row.
    """
    create_person()
    scraped = ScrapePerson('Dwayne Johnson')
    importer = PersonImporter('new-jurisdiction-id')
    importer.import_data([scraped.as_dict()])
    # A different jurisdiction must not be merged with the existing row.
    assert Person.objects.all().count() == 2
Пример #3
0
def test_deduplication_no_name_overlap():
    """Importing 'CM Punk' under 'jurisdiction-id' must leave two Person rows.

    Guards against over-eager deduplication that would refuse to import any
    new person into the same jurisdiction.
    """
    create_person()
    scraped = ScrapePerson('CM Punk')
    importer = PersonImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 2
Пример #4
0
    def scrape_legislator(self, chamber, name, url):
        """Scrape one legislator profile page and yield the resulting Person.

        Args:
            chamber: used as the Person's ``primary_org``.
            name: the legislator's name, supplied by the caller.
            url: profile page URL; also recorded as both link and source.

        Yields:
            A single Person, after ``self.scrape_offices`` has been called on it.
        """
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        # The district is the second whitespace-separated token of the last
        # matching "DISTRICT" heading text, with leading zeros removed.
        heading = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop()
        district = heading.split()[1].strip().lstrip('0')

        # The party letter follows an opening parenthesis in the last <h2>.
        party_text = page.xpath('//h2').pop().text_content()
        party_letter = re.search(r'\((R|D|I)[ \-\]]', party_text).group(1)
        # The regex only admits R, D or I, so this lookup always succeeds.
        party = {
            'D': 'Democratic',
            'R': 'Republican',
            'I': 'Independent',
        }[party_letter]

        portrait = page.xpath("//img[contains(@src, 'images/members/')]")[0]
        photo_url = portrait.attrib['src']

        leg = Person(name, district=district, party=party, image=photo_url, primary_org=chamber)
        leg.add_link(url)
        leg.add_source(url)
        self.scrape_offices(leg, page)

        yield leg
Пример #5
0
    def get_council(self):
        """Yield a Person for every member linked from the "City Directory" table.

        Each directory link whose href contains "/directory.aspx?EID=" is
        followed, and name, title and optional portrait are read from that page.

        Raises:
            ValueError: if a member page does not have exactly one BioName
                heading text, or the name or title patterns do not match
                exactly once.
        """
        council_doc = self.lxmlize(self.COUNCIL_URL)

        member_urls = council_doc.xpath(
            '//table[@summary="City Directory"]/tr//'
            'a[contains(@href, "/directory.aspx?EID=")]/@href')
        for member_url in member_urls:
            member_doc = self.lxmlize(member_url)

            [raw_name] = member_doc.xpath('//h1[@class="BioName"]/text()')
            # Drop an optional leading honorific and surrounding whitespace.
            [name] = re.findall(r'^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$', raw_name)

            # The number of text nodes varies with whether an email is present,
            # so the whole list is fetched and the second entry is used.
            bio_texts = member_doc.xpath(
                '//a[@class="BioLink"]/parent::div/text()')
            raw_title = bio_texts[1].strip()
            [title] = re.findall(
                r'^Title: (Council Member,?(?: Ward \d)|Mayor)\s*$', raw_title)

            # Use the portrait only when exactly one image is found.
            portraits = member_doc.xpath('//span[@class="BioText"]//img/@src')
            image_url = portraits[0] if len(portraits) == 1 else ''

            member = Person(name=name,
                            image=image_url,
                            primary_org='legislature',
                            role=title)
            member.add_source(member_url)

            yield member
Пример #6
0
    def get_council(self):
        """Yield a Person for every member linked from the "City Directory" table.

        Follows each directory link whose href contains "/directory.aspx?EID="
        and reads name, title and optional portrait from the member page.

        Raises:
            ValueError: if the name or title lookups do not return exactly one
                value, or their patterns do not match exactly once.
        """
        directory = self.lxmlize(self.COUNCIL_URL)
        profile_links = directory.xpath(
            '//table[@summary="City Directory"]/tr//'
            'a[contains(@href, "/directory.aspx?EID=")]/@href'
        )

        for member_url in profile_links:
            profile = self.lxmlize(member_url)

            [raw_name] = profile.xpath('//span[@class="BioName"]/span/text()')
            # Drop an optional leading honorific and surrounding whitespace.
            [name] = re.findall(r"^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$", raw_name)

            [raw_title] = profile.xpath('//a[@class="BioLink"]/following-sibling::text()')
            [title] = re.findall(r"^Title: (Council Member(?: Ward \d)|Mayor)\s*$", raw_title)

            # Anything other than exactly one portrait means "no image".
            image_url = ""
            try:
                [image_url] = profile.xpath('//span[@class="BioText"]//img/@src')
            except ValueError:
                pass

            member = Person(
                name=name,
                image=image_url,
                primary_org="legislature",
                role=title,
            )
            member.add_source(member_url)

            yield member
 def scrape_csv(self, reader):
     """Yield one Person per row of ``reader``.

     Each row must be a mapping with "Contact First Name" and
     "Contact Last Name" keys; the two are joined with a single space to form
     the name. Every Person gets ``SEARCH_URL`` as its source.
     """
     for record in reader:
         full_name = "{Contact First Name} {Contact Last Name}".format(**record)
         person = Person(name=full_name)
         person.add_source(SEARCH_URL)
         yield person
Пример #8
0
def test_bill_sponsor_by_identifier():
    """A sponsorship added by identifier resolves to the person holding that identifier.

    The sponsorship's display name ("SNODGRASS") differs from the stored name,
    so the match has to come from the identifier and scheme pair.
    """
    create_jurisdiction()
    org = create_org()

    ident = {'identifier': "TOTALLY_REAL_ID", 'scheme': "TOTALLY_REAL_SCHEME"}

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       **ident)

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')

    # Import the person who actually carries the identifier.
    scraped_person = ScrapePerson(name='Zadock Snodgrass')
    scraped_person.add_identifier(**ident)
    person_importer.import_data([scraped_person.as_dict()])

    stored_person = Person.objects.get()
    Membership.objects.create(person_id=stored_person.id,
                              organization_id=org.id)

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([bill.as_dict()])

    stored_bill = Bill.objects.get()
    (sponsorship,) = stored_bill.sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
Пример #9
0
def test_deduplication_other_name_exists():
    """Importing 'Rocky' must not add a row, leaving exactly one Person.

    Relies on ``create_person()`` (defined elsewhere) having stored 'Rocky'
    among the existing person's other names.
    """
    create_person()
    scraped = ScrapePerson('Rocky')
    importer = PersonImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
Пример #10
0
    def scrape_member(self, chamber, link):
        """Build a Person from a roster link and enrich it from the member page.

        Args:
            chamber: used as the Person's ``primary_org``.
            link: lxml anchor element. Its text is the name, its ``href`` is
                the profile URL, and the 3rd and 4th ``td`` of its grandparent
                supply district and party.

        Yields:
            A single Person, after ``self.scrape_member_page`` has processed it.
        """
        name = link.text.strip()
        leg_url = link.get('href')
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")

        # Email is no longer read from this table; the member page supplies it.

        # Normalise the two known party spellings; leave anything else as-is.
        party = {
            'Democrat': 'Democratic',
            'No Party Specified': 'Independent',
        }.get(party, party)

        # The photo endpoint is addressed by session and the personID in the href.
        pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))

        leg = Person(name=name, primary_org=chamber, district=district,
                     party=party, image=photo_url)
        leg.add_link(leg_url)
        leg.add_source(leg_url)

        member_html = self.get(link.attrib['href']).text
        self.scrape_member_page(leg, lxml.html.fromstring(member_html))
        yield leg
Пример #11
0
def test_deduplication_same_name():
    """Importing 'Dwayne Johnson' under 'jurisdiction-id' must leave exactly one Person.

    This is the simplest deduplication case: the scraped name matches directly.
    """
    create_person()
    scraped = ScrapePerson('Dwayne Johnson')
    importer = PersonImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
def test_multiple_orgs_of_same_class():
    """Memberships resolve by name when two organizations share a classification.

    Two "foundation"-classified organizations exist in one jurisdiction. Each
    scraped person names its target through ``primary_org_name`` and must end
    up attached to that organization.
    """
    jurisdiction = "fnd-jid"
    Organization.objects.create(id="fnd", name="Foundation", classification="foundation",
                                jurisdiction_id=jurisdiction)
    Organization.objects.create(id="fdr", name="Federation", classification="foundation",
                                jurisdiction_id=jurisdiction)

    hari = ScrapePerson('Hari Seldon', primary_org='foundation',
                        role='founder', primary_org_name='Foundation')
    picard = ScrapePerson('Jean Luc Picard', primary_org='foundation',
                          role='founder', primary_org_name='Federation')

    person_imp = PersonImporter('fnd-jid')
    for scraped in (hari, picard):
        person_imp.import_data([scraped.as_dict()])

    # Import both memberships in a single batch.
    membership_imp = MembershipImporter(
        'fnd-jid', person_imp, OrganizationImporter('fnd-jid'), DumbMockImporter())
    membership_imp.import_data(
        [scraped._related[0].as_dict() for scraped in (hari, picard)])

    expected = (('Hari Seldon', 'Foundation'), ('Jean Luc Picard', 'Federation'))
    for person_name, org_name in expected:
        membership = Person.objects.get(name=person_name).memberships.get()
        assert membership.organization.name == org_name
Пример #13
0
    def scrape_counciler(self, url):
        """Scrape one council member page and yield the resulting Person.

        Args:
            url: member page URL; also recorded as the Person's source.

        Yields:
            A single Person with one contact detail per non-empty detail cell.

        Raises:
            ValueError: if the name, district or image lookups do not return
                exactly one result, or a detail cell has no ":" separator.
            KeyError: if a detail label is not one of the four known labels.
        """
        page = self.lxmlize(url)
        [who] = page.xpath("//h3[@class='subtitle']/text()")
        [district] = page.xpath("//div[@class='right-bar']//h2/text()")
        [image] = page.xpath(
            "//div[@class='left-bar']//a[@class='image lightbox']//img"
        )

        member = Person(primary_org='legislature', name=who,
                        district=district, image=image.attrib['src'])
        member.add_source(url)

        for cell in page.xpath("//table[@align='center']//td"):
            text = cell.text_content().strip()
            if not text:
                continue

            # Each cell reads "<label>:<value>"; split on the first colon only.
            label, value = text.split(":", 1)
            contact_type = {
                "Home Phone": "voice",
                "Address": "address",
                "Email": "email",
                "Cell Phone": "voice",
            }[label]
            member.add_contact_detail(type=contact_type, note=label, value=value)

        yield member
Пример #14
0
    def scrape_upper_chamber(self, term):
        """Scrape the senator roster and yield one Person per occupied seat.

        Args:
            term: accepted but not used by this method.

        Yields:
            Person objects with ``primary_org='upper'``, after
            ``self.scrape_upper_offices`` has been called on each. Seats with
            no link text or marked 'Vacant' are logged and skipped.
        """
        url = "http://oksenate.gov/Senators/Default.aspx"
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        roster = doc.xpath('//table[@summary]')[0]
        for anchor in roster.xpath('.//td//a[contains(@href, "biographies")]'):
            # The district is the second token of either the parent's tail text
            # or, when that is absent, the second <span> of the grandparent.
            parent_tail = anchor.xpath('..')[0].tail
            if parent_tail:
                district = parent_tail.split()[1]
            else:
                district = anchor.xpath('../../span')[1].text.split()[1]

            if anchor.text is None or anchor.text.strip() == 'Vacant':
                self.warning("District {} appears to be empty".format(district))
                continue

            # Link text has the form "<name> (<party letter>)".
            parsed = re.match(r'(.+) \(([A-Z])\)', anchor.text.strip())
            name = parsed.group(1)
            party = self._parties[parsed.group(2)]

            profile_url = anchor.get('href')

            person = Person(
                primary_org='upper',
                district=district,
                name=name.strip(),
                party=party,
            )
            person.add_link(profile_url)
            person.add_source(profile_url)
            self.scrape_upper_offices(person, profile_url)
            yield person
Пример #15
0
    def scrape_chamber(self, chamber):
        """Scrape a chamber's member listing and yield one Person per living member.

        Args:
            chamber: 'upper' selects the senate site; any other value selects
                the house site. Also used as each Person's ``primary_org``.

        Yields:
            Person objects, after ``self._scrape_offices`` has been called on
            each. Entries carrying a label that starts with 'Deceased' are
            logged and skipped.

        Raises:
            KeyError: if an entry's party text (or a missing party) is not a
                key of the party map.
        """
        # Site wording -> party name stored on the Person.
        self._party_map = {
            'Democrat': 'Democratic',
            'Republican': 'Republican',
            'Non Affiliated': 'Independent',
            'Not Affiliated': 'Independent',
        }

        url = ('http://senate.legis.state.ak.us/' if chamber == 'upper'
               else 'http://house.legis.state.ak.us/')
        page = self.lxmlize(url)

        # Children of the second <ul class="item"> are treated as member entries.
        entries = page.xpath('//ul[@class="item"]')[1].getchildren()

        for entry in entries:
            photo_url = entry.xpath('.//img/@src')[0]
            name = entry.xpath('.//strong/text()')[0]
            leg_url = entry.xpath('.//a/@href')[0]

            email_hrefs = entry.xpath('.//a[text()="Email Me"]/@href')
            if email_hrefs:
                email = email_hrefs[0].replace('mailto:', '')
            else:
                # Without a link the (empty) lookup result itself is passed on.
                email = email_hrefs
                self.warning('no email for ' + name)

            party = None
            district = None
            deceased = False

            # Walk the <dt>/<dd> pairs, picking out party and district.
            for term in entry.xpath('.//dt'):
                definition = term.xpath('following-sibling::dd')[0].text_content()
                label = term.text.strip()
                if label == 'Party:':
                    party = definition
                elif label == 'District:':
                    district = definition
                elif label.startswith('Deceased'):
                    deceased = True
                    self.warning('skipping deceased ' + name)
                    break

            if deceased:
                continue

            person = Person(primary_org=chamber, district=district, name=name,
                            party=self._party_map[party], image=photo_url)
            person.add_source(leg_url)
            person.add_link(leg_url)

            self._scrape_offices(person, leg_url, email)

            yield person
Пример #16
0
def test_deduplication_other_name_overlaps():
    """A scraped person whose extra name is 'Dwayne Johnson' must not add a row.

    The scraped primary name is 'The Rock'; the overlap comes only from the
    name added through ``add_name``.
    """
    create_person()
    scraped = ScrapePerson('The Rock')
    scraped.add_name('Dwayne Johnson')
    importer = PersonImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
Пример #17
0
def test_invalid_fields_related_item():
    """An unknown key on a related item (a link) makes the import raise DataImportError."""
    scraped = ScrapePerson('Dwayne')
    scraped.add_link('http://example.com')
    payload = scraped.as_dict()
    # Inject a field that is not a valid link attribute.
    payload['links'][0]['test'] = 3

    with pytest.raises(DataImportError):
        PersonImporter('jid').import_data([payload])
Пример #18
0
def test_legislator_related_chamber_district():
    """pre_save on a Person with district and chamber yields one related post membership."""
    leg = Person('John Adams', district='1', primary_org='upper')
    leg.pre_save('jurisdiction-id')

    assert len(leg._related) == 1
    membership = leg._related[0]
    assert membership.person_id == leg._id
    assert get_pseudo_id(membership.organization_id) == {'classification': 'upper'}
    expected_post = {"organization__classification": "upper", "label": "1"}
    assert get_pseudo_id(membership.post_id) == expected_post
Пример #19
0
def test_person_add_term():
    """add_term records a valid related item carrying the organization and dates."""
    person = Person('Eternal')
    person.add_term('eternal', 'council', start_date='0001', end_date='9999')

    term = person._related[0]
    term.validate()
    assert get_pseudo_id(term.organization_id) == {'classification': 'council'}
    assert term.start_date == '0001'
    assert term.end_date == '9999'
Пример #20
0
def test_person_add_membership():
    """add_membership with an Organization object links both ids and the start date."""
    person = Person('Bob B. Bear')
    person.add_source('http://example.com')
    org = Organization('test org', classification='unknown')
    person.add_membership(org, role='member', start_date='2007')

    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
Пример #21
0
def test_person_add_membership_name():
    """add_membership with an organization name stores a name-based pseudo id."""
    org_name = 'Academy of Motion Picture Arts and Sciences'
    person = Person('Leonardo DiCaprio')
    person.add_membership(org_name, role='winner', start_date='2016')

    membership = person._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {'name': org_name}
    assert membership.person_id == person._id
    assert membership.role == 'winner'
    assert membership.start_date == '2016'
Пример #22
0
def test_legislator_related_district():
    """pre_save on a district-only legislator yields one 'member' post membership."""
    legislator = Person('John Adams', district='1', primary_org='legislature')
    legislator.pre_save('jurisdiction-id')

    assert len(legislator._related) == 1
    membership = legislator._related[0]
    assert membership.person_id == legislator._id
    assert get_pseudo_id(membership.organization_id) == {'classification': 'legislature'}
    expected_post = {
        "organization__classification": "legislature",
        "label": "1",
        "role": "member",
    }
    assert get_pseudo_id(membership.post_id) == expected_post
    assert membership.role == 'member'
Пример #23
0
def test_legislator_related_party():
    """pre_save on a Person with only a party yields exactly one party membership."""
    legislator = Person('John Adams', party='Democratic-Republican')
    legislator.pre_save('jurisdiction-id')

    assert len(legislator._related) == 1
    party_membership = legislator._related[0]
    assert party_membership.person_id == legislator._id
    expected_org = {'classification': 'party', 'name': 'Democratic-Republican'}
    assert get_pseudo_id(party_membership.organization_id) == expected_org
    assert party_membership.role == 'member'
Пример #24
0
    def handle_list_item(self, item):
        """Build a Person from one entry of the representative listing.

        Args:
            item: lxml element for a single list entry. It must hold a direct
                ``img``, an ``h5/a/b`` name node, at least three non-blank text
                nodes directly under its ``div`` children (two address lines,
                then a phone line), and at least two links, the second being
                the email link.

        Yields:
            A single Person with ``primary_org='lower'``. The phone and email
            contact details are attached only when the scraped text passes
            ``validate_phone_number`` / ``validate_email_address``. Previously
            a failed validation left the local variable unbound and the method
            crashed with UnboundLocalError.

        Raises:
            IndexError: if an expected node is missing from ``item``.
            AttributeError: if the name text does not match the expected form.
            KeyError: if the party abbreviation is not a key of ``PARTIES``.
        """
        photo_url = item.xpath('./img/@src')[0]
        url = item.xpath('.//h5/a/@href')[0]
        name_text = item.xpath('.//h5/a/b/text()')[0]

        # Expected form: "<name>(<two digits><A or B>, <PARTY>)".
        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip('0').upper()
        party = PARTIES[name_match.group(3)]

        stripped = (x.strip() for x in item.xpath('./div/text()[normalize-space()]'))
        info_texts = [text for text in stripped if text]
        address = '\n'.join((info_texts[0], info_texts[1]))
        phone_text = info_texts[2]

        email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address, note='capitol')
        # Skip values that fail validation instead of referencing an unbound name.
        if validate_phone_number(phone_text):
            rep.add_contact_detail(type='voice', value=phone_text, note='capitol')
        if validate_email_address(email_text):
            rep.add_contact_detail(type='email', value=email_text, note='capitol')
        rep.add_source(self.url)

        yield rep
Пример #25
0
    def scrape_lower(self, chamber):
        """Scrape the House representative list and yield one Person per filled seat.

        Args:
            chamber: accepted but not used; every Person gets
                ``primary_org='lower'``.

        Yields:
            Person objects with link, source and Capitol Office contact
            details. Rows named 'Vacant' or 'District <n>' are logged and
            skipped.

        Raises:
            IndexError: if a data row has fewer cells than expected columns.
        """
        url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
        # Column order of the roster table, left to right.
        columns = [
            "website",
            "district",
            "name",
            "party",
            "location",
            "phone",
            "email"
        ]

        doc = lxml.html.fromstring(self.get(url).text)

        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            cells = row.xpath('.//td')
            # Rows without <td> cells carry no member data.
            if not cells:
                continue

            metainf = {column: cells[index] for index, column in enumerate(columns)}

            def cell_text(column):
                return metainf[column].text_content().strip()

            # int() round-trip drops any leading zeros from the district number.
            district = str(int(cell_text('district')))
            party = cell_text('party')
            phone = cell_text('phone')
            email = cell_text('email')
            leg_url = metainf['website'].xpath("./a")[0].attrib['href']
            name = cell_text('name')
            if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
                self.warning('District {} appears vacant, and will be skipped'.format(district))
                continue

            # Expand the building abbreviations into full postal addresses.
            office = cell_text('location')
            expansions = (
                (' HOB',
                 ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933'),
                (' CB',
                 ' State Capitol Building\nLansing, MI 48909'),
            )
            for abbreviation, full_address in expansions:
                office = re.sub(abbreviation, full_address, office)

            photo_url = self.get_photo_url(leg_url)
            person = Person(name=name, district=district, party=abbr[party],
                            primary_org='lower', image=photo_url[0] if photo_url else None)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address', value=office, note='Capitol Office')
            person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            person.add_contact_detail(type='email', value=email, note='Capitol Office')

            yield person
Пример #26
0
def test_save_related():
    """save_object dumps the object first and then each object in its _related list."""
    scraper = Scraper('jurisdiction', '/tmp/')
    person = Person('Michael Jordan')
    person.add_source('http://example.com')
    team = Organization('Chicago Bulls')
    team.add_source('http://example.com')
    person._related.append(team)

    with mock.patch('json.dump') as json_dump:
        scraper.save_object(person)

    expected_calls = [
        mock.call(person.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(team.as_dict(), mock.ANY, cls=mock.ANY),
    ]
    assert json_dump.mock_calls == expected_calls
Пример #27
0
def test_save_object_basics():
    """save_object dumps the object once and records its file name under 'person'."""
    scraper = Scraper('jurisdiction', '/tmp/')
    person = Person('Michael Jordan')
    person.add_source('http://example.com')

    # json.dump is patched, so only the call itself is checked.
    with mock.patch('json.dump') as json_dump:
        scraper.save_object(person)

    expected_name = 'person_' + person._id + '.json'
    assert expected_name in scraper.output_names['person']
    json_dump.assert_called_once_with(person.as_dict(), mock.ANY, cls=mock.ANY)
Пример #28
0
    def scrape_member_page(self, chamber, url):
        """Scrape a member listing page and yield one Person per non-vacant module.

        Args:
            chamber: used as ``primary_org``; 'lower' selects the house email
                pattern and any other value the senate pattern.
            url: listing page URL; recorded as each Person's source.

        Yields:
            Person objects with Capitol Office contact details, after
            ``self.scrape_homepage`` has been called on each.

        Raises:
            KeyError: if a party letter is neither "R" nor "D".
        """
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        modules = page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"
        )
        for legislator in modules:
            thumbnail = legislator.xpath(".//div[@class='thumbnail']//img")[0]
            img = thumbnail.attrib['src']
            data = legislator.xpath(".//div[@class='data']")[0]
            name_link = data.xpath(".//a[@class='black']")[0]
            full_name = name_link.text_content()

            if "Vacant" in full_name:
                continue

            homepage = name_link.attrib['href']
            party_letter = data.xpath(".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party_letter]

            # The last direct text node is the phone; the rest form the address.
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)
            office = "\n".join(office_lines)

            headings = data.xpath("./h3")
            if headings:
                # District text follows the <br> inside the heading.
                after_break = headings[0].xpath("./br")[0].tail
                district = after_break.replace("District", "").strip()
            else:
                # Otherwise use the number of the last "<n>.png" in the inline style.
                png_names = re.findall(r"\d+\.png", legislator.attrib['style'])
                district = png_names[-1].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", full_name).strip()

            # Addresses embed the district number, zero-padded to two digits.
            if chamber == 'lower':
                email_template = 'rep{0:0{width}}@ohiohouse.gov'
            else:
                email_template = 'sd{0:0{width}}@ohiosenate.gov'
            email = email_template.format(int(district), width=2)

            leg = Person(name=full_name, district=district, party=party,
                         primary_org=chamber, image=img)

            leg.add_contact_detail(type='address', value=office, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
Пример #29
0
	def scrape_alderman(self, ward_num):
		"""Scrape the alderman for ``ward_num`` and return the resulting Person.

		The ward page URL is built from ``Utils.ALDERMEN_HOME``, and the profile
		URL comes from ``self.alderman_url``. Both are recorded as sources.
		"""
		ward_url = "{}/ward-{}".format(Utils.ALDERMEN_HOME, ward_num)
		alderman_url = self.alderman_url(ward_url)
		profile = self.lxmlize(alderman_url)

		# The first <h1> text on the profile page is taken as the name.
		name = profile.xpath("//h1/text()")[0]

		# Supplying district, role and primary_org lets pupa create the
		# membership that ties this person to the ward's post.
		person = Person(
			name=name,
			district="Ward {} Alderman".format(ward_num),
			role="Alderman",
			primary_org="legislature",
		)

		# Portrait and phone number.
		person.image = profile.xpath("//div/img/@src")[0]
		phone_texts = profile.xpath("//strong[text()='Phone:']/../text()")
		person.add_contact_detail(type="voice", value=phone_texts[1].strip())

		person.add_source(alderman_url, note="profile")
		person.add_source(ward_url, note="ward")

		return person
Пример #30
0
def test_no_membership_for_person():
    """Importing an empty membership list after a person import raises NoMembershipsError."""
    Organization.objects.create(id="fnd", name="Foundation", classification="foundation",
                                jurisdiction_id="fnd-jid")

    # Import a person that has no memberships at all.
    loner = ScrapePerson('a man without a country')
    person_imp = PersonImporter('fnd-jid')
    person_imp.import_data([loner.as_dict()])

    # The same mock importer fills both remaining importer slots.
    mock_imp = DumbMockImporter()
    membership_imp = MembershipImporter('fnd-jid', person_imp, mock_imp, mock_imp)

    with pytest.raises(NoMembershipsError):
        membership_imp.import_data([])
Пример #31
0
def test_save_object_invalid():
    """save_object raises ValueError for a Person that has no source."""
    scraper = Scraper('jurisdiction', '/tmp/')
    # No add_source call on purpose: the object must fail validation.
    unsourced = Person('Michael Jordan')

    with pytest.raises(ValueError):
        scraper.save_object(unsourced)
def test_committee_add_member_person():
    """add_member with a Person object records both ids and the role."""
    committee = Organization('Defense', classification='committee')
    person = Person('John Adams')
    committee.add_member(person, role='chairman')

    membership = committee._related[0]
    assert membership.person_id == person._id
    assert membership.organization_id == committee._id
    assert membership.role == 'chairman'
Пример #33
0
    def scrape_senator_page(self, chamber, url):
        """Scrape the senator listing at ``url`` and yield one Person per filled seat.

        Args:
            chamber: used as ``primary_org``; 'lower' selects the house email
                pattern and any other value the senate pattern.
            url: listing page URL; recorded as each Person's source.

        Yields:
            Person objects with Capitol Office address, phone and email.
            Entries whose name contains "Vacant" are skipped before their
            profile page is fetched.

        Raises:
            ValueError: if a senator's party image URL names neither
                'Republican' nor 'Democrat'. Previously ``party`` was left
                unset in that case, which either crashed with
                UnboundLocalError or silently reused the previous senator's
                party.
        """
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' portraitContainer ')]"):
            # The portrait URL sits between the parentheses of the inline style.
            img = legislator.xpath(".//div[@class='profileThumbnailBoundingBox']/@style")[0]
            img = img[img.find('(')+1:img.find(')')]
            full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[0].attrib['href']
            # District text has the form "...#<number>".
            district = legislator.xpath(".//div[@class='profileDistrict']"
                                        "/a/text()")[0].split("#")[1]

            if "Vacant" in full_name:
                continue

            # Separate name so the listing document is not shadowed mid-loop.
            profile_page = lxml.html.fromstring(self.get(homepage_url).text)
            phone = profile_page.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = profile_page.xpath("//div[@class='address']/span/text()")
            address = "\n".join(address_lines)

            # The party is only available through the party image's file name.
            party_image = profile_page.xpath('//div[@class="senatorParty"]/img/@src')[0]
            if 'Republican' in party_image:
                party = 'Republican'
            elif 'Democrat' in party_image:
                party = 'Democratic'
            else:
                # Fail loudly rather than reuse a stale value from the previous row.
                raise ValueError(
                    "Unrecognized party image {!r} for {}".format(party_image, full_name))

            # Addresses embed the district number, zero-padded to two digits.
            email = (
                'rep{0:0{width}}@ohiohouse.gov'
                if chamber == 'lower' else
                'sd{0:0{width}}@ohiosenate.gov'
            ).format(int(district), width=2)

            leg = Person(name=full_name, district=district,
                         primary_org=chamber, image=img, party=party)

            leg.add_contact_detail(type='address', value=address, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Пример #34
0
def test_full_vote_event():
    """End-to-end import of a vote event with a bill, a count and two resolved voters."""
    jurisdiction = create_jurisdiction()
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    john = ScrapePerson('John Smith', primary_org='lower')
    adam = ScrapePerson('Adam Smith', primary_org='lower')
    house = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=house._id)
    vote_event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                                 start_date='1900-04-01', classification='passage:bill',
                                 result='pass', bill_chamber='lower', bill='HB 1',
                                 organization=house._id)
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    # Import in dependency order: org, people, memberships, bill, vote event.
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict()])

    person_importer = PersonImporter('jid')
    person_importer.import_data([john.as_dict(), adam.as_dict()])

    membership_importer = MembershipImporter(
        'jid', person_importer, org_importer, DumbMockImporter())
    membership_importer.import_data(
        [john._related[0].as_dict(), adam._related[0].as_dict()])

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([bill.as_dict()])

    vote_importer = VoteEventImporter('jid', person_importer, org_importer, bill_importer)
    vote_importer.import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    stored = VoteEvent.objects.get()
    assert stored.legislative_session == LegislativeSession.objects.get()
    assert stored.motion_classification == ['passage:bill']
    assert stored.bill == Bill.objects.get()

    tally = stored.counts.get()
    assert tally.option == 'yes'
    assert tally.value == 20

    votes = list(stored.votes.all())
    assert len(votes) == 2
    for vote in stored.votes.all():
        # John voted yes; the only other voter, Adam, voted no.
        if vote.voter_name == 'John Smith':
            expected_option, expected_name = 'yes', 'John Smith'
        else:
            expected_option, expected_name = 'no', 'Adam Smith'
        assert vote.option == expected_option
        assert vote.voter == Person.objects.get(name=expected_name)
Пример #35
0
def test_multiple_memberships():
    """Deduplication still works when a person has two memberships in one jurisdiction.

    Regression test: two or more memberships in the same jurisdiction used to
    cause an ORM error during import.
    """
    wrestler = Person.objects.create(name='Dwayne Johnson')
    for org_name in ('WWE', 'WWF'):
        org = Organization.objects.create(name=org_name,
                                          jurisdiction_id='jurisdiction-id')
        Membership.objects.create(person=wrestler, organization=org)

    scraped = ScrapePerson('Dwayne Johnson')
    importer = PersonImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])

    # The import must match the existing row instead of adding a second one.
    assert Person.objects.all().count() == 1
Пример #36
0
def test_no_membership_for_person():
    """Importing an empty membership list raises NoMembershipsError."""
    Organization.objects.create(
        id="fnd",
        name="Foundation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )

    # the person is imported without any membership attached
    stateless = ScrapePerson('a man without a country')
    people = PersonImporter('fnd-jid')
    people.import_data([stateless.as_dict()])

    # one stub importer fills both remaining importer arguments
    stub = DumbMockImporter()
    memberships = MembershipImporter('fnd-jid', people, stub, stub)

    with pytest.raises(NoMembershipsError):
        memberships.import_data([])
Пример #37
0
    def scrape(self):
        """Yield the Temecula City Council organization, then each member.

        Every row after the first one of the second table on the listing
        page is read as one council member.
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization('Temecula City Council',
                               classification='legislature')
        council.add_source(urls.list.url)
        yield council

        rows = urls.list.xpath('//table[2]//tr')
        for row in rows[1:]:
            # Exactly two text nodes are expected here: name, then role.
            name, role = row.xpath('td/p[1]//font/text()')
            photo = row.xpath('td/img/@src')[-1]

            person = Person(name, image=photo)
            membership = person.add_membership(council, role=role)

            # Exactly two hrefs are expected: the email link and the detail page.
            mail_href, detail_url = row.xpath('td//a/@href')
            # Drop the first seven characters (presumably a "mailto:" prefix).
            membership.contact_details.append(
                {'type': 'email', 'value': mail_href[7:], 'note': 'work'})

            person.add_source(urls.list.url)
            person.add_source(detail_url)

            yield person
Пример #38
0
    def scrape_upper_chamber(self, term):
        """Yield a Person for every senator linked from the Senate roster page.

        ``term`` is not used by this method.

        Raises:
            ValueError: if a senator's seat type is neither
                "Senador por Distrito" nor "Senador por Acumulación", so no
                district can be determined.
        """
        url = 'https://senado.pr.gov/Pages/Senadores.aspx'

        doc = self.lxmlize(url)
        links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')

        for link in links:
            senator_page = self.lxmlize(link)
            profile_links = self.get_nodes(senator_page, '//ul[@class="profiles-links"]/li')

            name_text = self.get_node(senator_page, '//span[@class="name"]').text_content().strip()
            # Strip a leading "Hon." honorific, whatever its letter case.
            name = re.sub(r'^Hon\.', '', name_text, flags=re.IGNORECASE).strip()
            party = profile_links[0].text_content().strip()
            photo_url = self.get_node(senator_page, '//div[@class="avatar"]//img/@src')

            seat_type = profile_links[1].text_content().strip()
            if seat_type == "Senador por Distrito":
                district_text = self.get_node(
                    senator_page,
                    '//div[@class="module-distrito"]//span[@class="headline"]').text_content()
                # Remove the "DISTRITO" label and any zero-width spaces.
                district = district_text.replace('DISTRITO', '', 1).replace('\u200b', '').strip()
            elif seat_type == "Senador por Acumulación":
                district = "At-Large"
            else:
                # Without this branch ``district`` would be unbound for the
                # first senator, or silently keep the previous senator's value.
                raise ValueError(
                    'Unrecognized seat type {!r} on {}'.format(seat_type, link))

            phone_node = self.get_node(senator_page, '//a[@class="contact-data tel"]')
            phone = phone_node.text_content().strip()
            email_node = self.get_node(senator_page, '//a[@class="contact-data email"]')
            email = email_node.text_content().replace('\u200b', '').strip()

            person = Person(primary_org='upper',
                            district=district,
                            name=name,
                            party=party,
                            image=photo_url)
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
            person.add_link(link)
            person.add_source(link)

            yield person
Пример #39
0
    def scrape_rep(self, url):
        """Yield a Person for the representative whose profile page is ``url``.

        Yields nothing when the page's main-info section contains "Resigned".
        The email contact detail is added only when an address was found.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        main = page.xpath('//div[@id="main-info"]')[0]
        if "Resigned" in main.text_content():
            print("Member resigned {}".format(url))
            # A bare return ends the generator cleanly; raising StopIteration
            # inside a generator is turned into RuntimeError (PEP 479).
            return

        name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
        name = re.sub(r"\s+", " ", name)
        district_number = page.xpath(
            '//span[contains(text(), "House District:")]'
            "/following-sibling::span/text()")[0].strip()
        # remove anything after first whitespace
        district_number = re.sub(r"\s.*", "", district_number.strip())

        email = None
        email_content = page.xpath(
            '//a[./i[contains(@class,"fa-envelope")]]/text()')
        if email_content and email_content[0].strip():
            email = email_content[0].strip()

        photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

        party = self.get_rep_table_by_header(page,
                                             "Party Affiliation").text.strip()
        # _party_map is keyed by the first letter of the party text
        party = _party_map[party[0]]

        main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
        # the first non-blank text node of the paragraph is used as the address
        address = [t.strip() for t in main_p_text if t.strip()][0]

        person = Person(
            name=name,
            district=district_number,
            primary_org="lower",
            party=party,
            image=photo_url,
        )

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="District Office")
        if email:
            # skip the contact entry rather than store a None value
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Office")

        person.add_source(url)

        yield person
Пример #40
0
    def handle_list_item(self, item):
        """Yield a Representative built from one row of the member list.

        The bold link text must match ``<name>(<NN><A|B>, <PARTY>)``: a name,
        a two-digit district with an A/B suffix, and an upper-case party code
        that is looked up in ``PARTIES``.

        The phone and email contact details are added only when the scraped
        text passes ``validate_phone_number`` / ``validate_email_address``.
        """
        photo_url = item.xpath('./td[1]/a/img/@src')[0]
        info_nodes = item.xpath('./td[2]/p/a')
        name_text = info_nodes[0].xpath('./b/text()')[0]
        url = info_nodes[0].get('href')

        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [
            x.strip() for x in item.xpath(
                './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
            ) if x.strip()
        ]
        # the first two text lines form the address, the third is the phone
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Default to None so a failed validation cannot leave the name unbound.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        email_text = info_nodes[1].text.replace('Email: ', '').strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(name=name,
                     district=district,
                     party=party,
                     primary_org='lower',
                     role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address, note='capitol')
        if phone is not None:
            rep.add_contact_detail(type='voice', value=phone, note='capitol')
        if email is not None:
            rep.add_contact_detail(type='email', value=email, note='capitol')
        rep.add_source(self.url)

        yield rep
Пример #41
0
    def handle_list_item(self, item):
        """Yield a Representative built from one entry of the member list.

        The bold link text must match ``<name>(<NN><A|B>, <PARTY>)``: a name,
        a two-digit district with an A/B suffix, and an upper-case party code
        that is looked up in ``PARTIES``.

        The phone and email contact details are added only when the scraped
        text passes ``validate_phone_number`` / ``validate_email_address``.
        """
        photo_url = item.xpath("./img/@src")[0]
        url = item.xpath(".//h5/a/@href")[0]
        name_text = item.xpath(".//h5/a/b/text()")[0]

        name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip("0").upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [
            x.strip()
            for x in item.xpath("./div/text()[normalize-space()]")
            if x.strip()
        ]
        # the first two text lines form the address, the third is the phone
        address = "\n".join((info_texts[0], info_texts[1]))

        # Default to None so a failed validation cannot leave the name unbound.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        # the second href inside the entry is taken as the mailto link
        email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=photo_url,
        )
        rep.add_link(url)
        rep.add_contact_detail(type="address", value=address, note="capitol")
        if phone is not None:
            rep.add_contact_detail(type="voice", value=phone, note="capitol")
        if email is not None:
            rep.add_contact_detail(type="email", value=email, note="capitol")
        rep.add_source(self.url)

        yield rep
Пример #42
0
def test_multiple_orgs_of_same_class():
    """
    Memberships can be set on organizations that share a classification
    within the same jurisdiction.
    """
    create_jurisdiction()
    for org_id, org_name in (("fnd", "Foundation"), ("fdr", "Federation")):
        Organization.objects.create(
            id=org_id,
            name=org_name,
            classification="foundation",
            jurisdiction_id="fnd-jid",
        )

    # both people are founders; only the named organization differs
    shared = dict(primary_org='foundation', role='founder')
    hari = ScrapePerson('Hari Seldon', primary_org_name='Foundation', **shared)
    picard = ScrapePerson('Jean Luc Picard', primary_org_name='Federation', **shared)

    people = PersonImporter('fnd-jid')
    for scraped in (hari, picard):
        people.import_data([scraped.as_dict()])

    # import the membership each scraped person carries
    memberships = MembershipImporter(
        'fnd-jid', people, OrganizationImporter('fnd-jid'), DumbMockImporter())
    memberships.import_data(
        [scraped._related[0].as_dict() for scraped in (hari, picard)])

    expected = (('Hari Seldon', 'Foundation'), ('Jean Luc Picard', 'Federation'))
    for person_name, org_name in expected:
        membership = Person.objects.get(name=person_name).memberships.get()
        assert membership.organization.name == org_name
Пример #43
0
    def parse_row(self, row, chamber):
        """Build and return a Person for ``chamber`` from one ``row``.

        Reads the 'First Name', 'Surname', 'State', 'Gender' and
        'Political Party' keys of ``row``; the row is also printed.
        """
        print(row)

        given = row['First Name']
        family = row['Surname']

        # TODO: map state to ocd
        # TODO: https://www.aph.gov.au/Senators_and_Members/Senators/Senators_by_service_expiry_date

        member = Person(
            name='{} {}'.format(given, family),
            district=row['State'],
            role='member',
            primary_org=chamber,
            gender=row['Gender'].lower(),
            party=row['Political Party'],
        )

        # keep the name parts separately alongside the display name
        member.extras['given_name'] = given
        member.extras['family_name'] = family

        return member
Пример #44
0
    def scrape_member(self, chamber, link):
        """Yield a Person for the member behind ``link`` in a roster table.

        District and party come from the third and fourth cells of the
        link's row; remaining details are filled in by
        ``scrape_member_page`` from the member's own page.
        """
        party_names = {
            "Democrat": "Democratic",
            "No Party Specified": "Independent",
        }

        name = link.text.strip()
        leg_url = link.get("href")
        district = link.xpath("string(../../td[3])")
        raw_party = link.xpath("string(../../td[4])")
        # any party not listed above is kept exactly as scraped
        party = party_names.get(raw_party, raw_party)

        # the photo endpoint is keyed by session and the personID in the link
        person_id = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
        photo_template = ("https://www.legis.iowa.gov/photo"
                          "?action=getPhoto&ga=%s&pid=%s")
        photo_url = photo_template % (self.latest_session(), person_id)

        leg = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url,
        )
        leg.add_link(leg_url)
        leg.add_source(leg_url)

        response = self.get(link.attrib["href"])
        self.scrape_member_page(leg, lxml.html.fromstring(response.text))
        yield leg
Пример #45
0
    def scrape_upper_chamber(self, term):
        """Yield a Person for each occupied seat listed on the senators page.

        Seats whose link has no text or reads 'Vacant' are logged and
        skipped. ``term`` is not used by this method.
        """
        url = "http://oksenate.gov/Senators/Default.aspx"
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        roster = doc.xpath('//table[@summary]')[0]
        for anchor in roster.xpath('.//td//a[contains(@href, "biographies")]'):
            # The district is the second whitespace-separated token of the text
            # trailing the link's parent, or else of the second nearby span.
            trailing = anchor.xpath('..')[0].tail
            if trailing:
                district = trailing.split()[1]
            else:
                district = anchor.xpath('../../span')[1].text.split()[1]

            label = anchor.text
            if label is None or label.strip() == 'Vacant':
                self.warning(
                    "District {} appears to be empty".format(district))
                continue

            # link text has the form "<name> (<one-letter party code>)"
            match = re.match(r'(.+) \(([A-Z])\)', label.strip())
            name = match.group(1)
            party = self._parties[match.group(2)]

            profile_url = anchor.get('href')

            person = Person(
                primary_org='upper',
                district=district,
                name=name.strip(),
                party=party,
            )
            person.add_link(profile_url)
            person.add_source(profile_url)
            self.scrape_upper_offices(person, profile_url)
            yield person
Пример #46
0
    def scrape_legislator(self, chamber, name, url):
        """Yield a Person for ``name`` using details from the page at ``url``.

        District, party and photo are read from the page; office details
        are added by ``scrape_offices``.
        """
        party_names = {
            'D': 'Democratic',
            'R': 'Republican',
            'I': 'Independent',
        }

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        # the last matching heading text holds "... <district number> ...";
        # take its second token and drop leading zeros
        heading = page.xpath('//h1[contains(., "DISTRICT")]/text()')[-1]
        district = heading.split()[1].strip().lstrip('0')

        # the last <h2> carries the party letter right after "("
        party_text = page.xpath('//h2')[-1].text_content()
        party_code = re.search(r'\((R|D|I)[ \-\]]', party_text).group(1)
        party = party_names[party_code]

        photo_node = page.xpath("//img[contains(@src, 'images/members/')]")[0]
        photo_url = photo_node.attrib['src']

        leg = Person(name,
                     district=district,
                     party=party,
                     image=photo_url,
                     primary_org=chamber)
        leg.add_link(url)
        leg.add_source(url)
        self.scrape_offices(leg, page)

        yield leg
Пример #47
0
    def scrape_lower_legislator(self, url, leg_info):
        """Yield a Person for the lower-chamber member page at ``url``.

        ``leg_info`` supplies the "dist", "office" and "phone" values.
        Nothing is yielded when the displayed name marks a vacant seat.
        """
        page = self.lxmlize(url)

        name = page.xpath(
            '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip()
        if name.startswith(("District ", "Vacant ")):
            self.warning("Seat is vacant: {}".format(name))
            return

        photo_node = page.xpath('//img[contains(@src, "/h_reps/RepPics")]')[0]
        photo = photo_node.attrib["src"]

        party_flags = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent",
        }
        party_label = page.xpath(
            '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
        )[0].strip()
        party = party_flags[party_label]

        # the email span may be missing, in which case no email is recorded
        email_texts = page.xpath(
            '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
        )
        email = email_texts[0].strip() if email_texts else None

        district = leg_info["dist"].replace("Dist", "").strip()

        person = Person(name=name,
                        party=party,
                        district=district,
                        primary_org="lower",
                        image=photo)

        # only non-empty values become contact details
        contacts = (
            ("address", leg_info["office"]),
            ("voice", leg_info["phone"]),
            ("email", email),
        )
        for kind, value in contacts:
            if not value:
                continue
            person.add_contact_detail(type=kind,
                                      value=value,
                                      note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
Пример #48
0
def test_no_membership_for_person_including_party():
    """
    A party alone does not bind a person to a jurisdiction, so the import
    must still raise a no memberships error rather than risk duplication.
    """
    create_jurisdiction()
    org_rows = (
        dict(id="fnd", name="Foundation", classification="foundation",
             jurisdiction_id="fnd-jid"),
        dict(id="dem", name="Democratic", classification="party"),
    )
    for row in org_rows:
        Organization.objects.create(**row)

    # the person has a party but no other membership
    stateless = ScrapePerson('a man without a country', party='Democratic')
    people = PersonImporter('fnd-jid')
    orgs = OrganizationImporter('fnd-jid')
    people.import_data([stateless.as_dict()])

    # importing the person's only related object must be rejected
    memberships = MembershipImporter('fnd-jid', people, orgs, DumbMockImporter())

    with pytest.raises(NoMembershipsError):
        memberships.import_data([stateless._related[0].as_dict()])
Пример #49
0
    def scrape_chamber(self, chamber):
        """Yield a Person for every living member listed for ``chamber``.

        ``chamber`` equal to 'upper' selects the senate site; any other
        value selects the house site. Members labelled as deceased are
        logged and skipped.
        """
        self._party_map = {
            'Democrat': 'Democratic',
            'Republican': 'Republican',
            'Non Affiliated': 'Independent',
            'Not Affiliated': 'Independent',
        }

        url = ('http://senate.legis.state.ak.us/' if chamber == 'upper'
               else 'http://house.legis.state.ak.us/')
        page = self.lxmlize(url)

        # members are the children of the second <ul class="item">
        members = page.xpath('//ul[@class="item"]')[1].getchildren()

        for member in members:
            photo_url = member.xpath('.//img/@src')[0]
            name = member.xpath('.//strong/text()')[0]
            leg_url = member.xpath('.//a/@href')[0]

            # email stays an empty list when no "Email Me" link is present
            email = member.xpath('.//a[text()="Email Me"]/@href')
            if not email:
                self.warning('no email for ' + name)
            else:
                email = email[0].replace('mailto:', '')

            party = None
            district = None
            deceased = False

            # each <dt> label is paired with its first following <dd>
            for term_node in member.xpath('.//dt'):
                value = term_node.xpath('following-sibling::dd')[0].text_content()
                label = term_node.text.strip()
                if label.startswith('Deceased'):
                    deceased = True
                    self.warning('skipping deceased ' + name)
                    break
                if label == 'Party:':
                    party = value
                elif label == 'District:':
                    district = value

            if deceased:
                continue

            person = Person(
                primary_org=chamber,
                district=district,
                name=name,
                party=self._party_map[party],
                image=photo_url,
            )
            person.add_source(leg_url)
            person.add_link(leg_url)

            # scrape offices
            self._scrape_offices(person, leg_url, email)

            yield person
Пример #50
0
    def scrape(self):
        """Yield a Person for every member of the lower, then the upper house.

        Each member is sourced to the list page it was actually scraped
        from. Lower-house members also get their profile URL as a link.
        """
        # lower
        url = 'http://164.100.47.194/Loksabha/Members/AlphabeticalList.aspx'
        entry = self.get(url).content

        page = lxml.html.fromstring(entry)
        page.make_links_absolute(url)

        for tr in page.xpath(
                '//table[contains(@class,"member_list_table")]/tr'):
            name = tr.xpath('td[2]/a[1]/@title')[0]
            bio = tr.xpath('td[2]/a[1]/@href')[0]
            photo_url = tr.xpath('td[2]/a[1]/img/@src')[0]
            party = tr.xpath('td[3]/text()')[0].strip()
            state = tr.xpath('td[4]/text()')[0].strip()
            member = Person(name=name,
                            role="member",
                            primary_org="lower",
                            party=party,
                            image=photo_url,
                            district=state)

            # Source the lower-house list itself; the upper-house URL was
            # previously recorded here by mistake.
            member.add_source(url)
            member.add_link(bio)
            yield member

        # upper
        url = 'http://164.100.47.5/Newmembers/memberlist.aspx'
        entry = self.get(url).content

        page = lxml.html.fromstring(entry)
        page.make_links_absolute(url)

        # the first row of the grid is skipped
        for tr in page.xpath(
                '//table[@id="ContentPlaceHolder1_GridView2"]/tr')[1:]:
            name = tr.xpath('td[2]/font/a/text()')[0]
            party_abbr = tr.xpath('td[3]/font/text()')[0].strip()
            state = tr.xpath('td[4]/font/text()')[0].strip()

            member = Person(name=name,
                            role="member",
                            primary_org="upper",
                            party=party_abbr,
                            district=state)

            member.add_source(url)
            yield member
Пример #51
0
def test_person_add_membership():
    """add_membership records one valid related object linking person and org."""
    bear = Person('Bob B. Bear')
    bear.add_source('http://example.com')
    org = Organization('test org')
    bear.add_membership(org, 'member', start_date='2007')

    assert len(bear._related) == 1
    membership = bear._related[0]
    membership.validate()
    assert membership.person_id == bear._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
Пример #52
0
def test_person_add_membership_org():
    """add_membership keeps a string start_date and a date-object end_date."""
    bear = Person('Bob B. Bear')
    bear.add_source('http://example.com')
    org = Organization('test org', classification='unknown')
    last_day = datetime.date(2015, 5, 8)
    bear.add_membership(org, role='member', start_date='2007', end_date=last_day)

    assert len(bear._related) == 1
    membership = bear._related[0]
    membership.validate()
    assert membership.person_id == bear._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
    assert membership.end_date == last_day
Пример #53
0
def test_same_name_people_other_name():
    """The name collision check also takes other_names into account."""
    create_jurisdiction()
    Organization.objects.create(name='WWE', jurisdiction_id='jid')

    johnson = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    rock = ScrapePerson('Rock', image='http://example.com/2')
    # the alternate name matches the first person's primary name
    rock.add_name('Dwayne Johnson')

    # same name, but the two records apparently describe different people
    with pytest.raises(SameNameError):
        PersonImporter('jid').import_data(
            [scraped.as_dict() for scraped in (johnson, rock)])
Пример #54
0
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Return ``(Person, profile_url)`` built from one six-cell table row.

    The cells are, in order: role, name, district, party, phone, email.
    A name containing exactly one ", " is flipped from "Last, First" to
    "First Last". Phone and email are added only when non-empty.
    """
    party_names = {"Democrat": "Democratic", "Unaffiliated": "Independent"}

    def cell_text(cell):
        return cell.text_content().strip()

    (role_td, name_td, district_td,
     party_td, phone_td, email_td) = table_row_element.xpath("td")

    full_name = cell_text(name_td)
    if full_name.count(", ") == 1:
        last, first = full_name.split(", ")
        full_name = " ".join((first, last)).strip()

    district = cell_text(district_td)
    scraped_party = cell_text(party_td)
    # parties not listed above are kept as scraped
    party = party_names.get(scraped_party, scraped_party)

    address = co_address_from_role(cell_text(role_td))
    phone = cell_text(phone_td)
    email = cell_text(email_td)

    # exactly one profile link is expected inside the name cell
    (profile_url,) = name_td.xpath("a/@href")
    print(chamber, district, party)

    legislator = Person(
        primary_org=chamber, name=full_name, district=district, party=party
    )
    details = [("address", address)]
    if phone:
        details.append(("voice", phone))
    if email:
        details.append(("email", email))
    for kind, value in details:
        legislator.add_contact_detail(type=kind, value=value, note="Capitol Office")

    return legislator, profile_url
Пример #55
0
def test_same_name_second_import():
    """A later same-name import without a birth date raises SameNameError."""
    create_jurisdiction()
    org = Organization.objects.create(name='WWE', jurisdiction_id='jid')

    # two namesakes that differ by image and birth date import fine together
    namesakes = []
    for image, born in (('http://example.com/1', '1970'),
                        ('http://example.com/2', '1930')):
        scraped = ScrapePerson('Dwayne Johnson', image=image)
        scraped.birth_date = born
        namesakes.append(scraped)
    PersonImporter('jid').import_data([s.as_dict() for s in namesakes])

    # fake some memberships so future lookups work on these people
    for stored in Person.objects.all():
        Membership.objects.create(person=stored, organization=org)

    # a third namesake without a birth date cannot be told apart
    newcomer = ScrapePerson('Dwayne Johnson', image='http://example.com/3')

    with pytest.raises(SameNameError):
        PersonImporter('jid').import_data([newcomer.as_dict()])
Пример #56
0
def test_bill_sponsor_limit_lookup():
    """A sponsorship added by identifier resolves to the person in 'jid'.

    The bill's sponsor is referenced only by identifier and scheme; after
    the same scraped person is also imported under 'another-jurisdiction',
    the bill imported into 'jid' must point at the person born 1800-01-01.
    """
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1',
                      '1900',
                      'Axe & Tack Tax Act',
                      classification='tax bill',
                      chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    # first person carrying the identifier, imported into 'jid'
    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    # attach the only stored person to the org
    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    # NOTE(review): zs2 is built with the same identifier but is never used
    # below; the cross-jurisdiction import passes zs instead. Confirm whether
    # zs2 was the intended argument.
    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID',
                       scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    PersonImporter('another-jurisdiction').import_data([zs.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    # exactly one sponsorship is expected on the imported bill
    (entry, ) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
Пример #57
0
    def handle_list_item(self, item):
        """Yield the detail-page results and then the Person for ``item``.

        Nothing is yielded when the link text mentions a resignation or
        vacancy, or when ``CHAMBER_MOVES`` maps the text to another chamber.
        """
        raw_name = item.text

        lowered = raw_name.lower()
        if any(word in lowered for word in ('resigned', 'vacated')):
            return
        if raw_name in CHAMBER_MOVES:
            if self.chamber != CHAMBER_MOVES[raw_name]:
                return

        # clean_name returns three values; only the cleaned name is used here
        name, _action, _date = clean_name(raw_name)
        href = item.get('href')

        leg = Person(name=name)
        leg.add_source(self.url)
        leg.add_source(href)
        leg.add_link(href)

        # the detail page receives the Person via ``obj``
        yield from self.scrape_page(
            self.detail_page,
            href,
            session=self.kwargs['session'],
            obj=leg,
        )
        yield leg
Пример #58
0
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Return ``(Person, profile_url)`` built from one six-cell table row.

    The cells are, in order: role, name, district, party, phone, email.
    The name is used exactly as displayed. Phone and email are added only
    when non-empty.
    """
    def cell_text(cell):
        return cell.text_content().strip()

    cells = table_row_element.xpath('td')
    role_td, name_td, district_td, party_td, phone_td, email_td = cells

    full_name = cell_text(name_td)
    district = cell_text(district_td)
    scraped_party = cell_text(party_td)
    party = 'Democratic' if scraped_party == 'Democrat' else scraped_party

    address = co_address_from_role(cell_text(role_td))
    phone = cell_text(phone_td)
    email = cell_text(email_td)

    # exactly one profile link is expected inside the name cell
    (profile_url, ) = name_td.xpath('a/@href')
    print(chamber, district, party)

    legislator = Person(primary_org=chamber,
                        name=full_name,
                        district=district,
                        party=party)
    details = [('address', address)]
    if phone:
        details.append(('voice', phone))
    if email:
        details.append(('email', email))
    for kind, value in details:
        legislator.add_contact_detail(type=kind,
                                      value=value,
                                      note='Capitol Office')

    return legislator, profile_url
Пример #59
0
    def handle_list_item(self, item):
        """Yield the detail-page results and then the Person for ``item``.

        Nothing is yielded when the link text mentions a resignation,
        vacancy or retirement, or when ``CHAMBER_MOVES`` maps the text to
        another chamber.
        """
        raw_name = item.text

        lowered = raw_name.lower()
        if any(word in lowered for word in ("resigned", "vacated", "retired")):
            return
        if raw_name in CHAMBER_MOVES:
            if self.chamber != CHAMBER_MOVES[raw_name]:
                return

        # clean_name returns three values; only the cleaned name is used here
        name, _action, _date = clean_name(raw_name)
        href = item.get("href")

        leg = Person(name=name)
        leg.add_source(self.url)
        leg.add_source(href)
        leg.add_link(href)

        # the detail page receives the Person via ``obj``
        yield from self.scrape_page(
            self.detail_page,
            href,
            session=self.kwargs["session"],
            committees=self.kwargs["committees"],
            obj=leg,
        )
        yield leg
Пример #60
0
    def handle_list_item(self, item):
        """Return a Senator Person for ``item``, or None for a vacant seat.

        District and party come from the first and second cells of the
        link's row; the detail page is processed with ``SenDetail``.
        """
        # join all text nodes, collapse whitespace, tidy spaces before commas
        joined = " ".join(item.xpath('.//text()'))
        display_name = re.sub(r'\s+', " ", joined).replace(" ,", ",").strip()

        if 'Vacant' in display_name:
            return

        district = item.xpath("string(../../td[1])")
        scraped_party = item.xpath("string(../../td[2])")
        party = 'Democratic' if scraped_party == 'Democrat' else scraped_party

        leg_url = item.get('href')

        leg = Person(name=fix_name(display_name), district=district, party=party,
                     primary_org='upper', role='Senator')
        leg.add_link(leg_url)
        leg.add_source(self.url)
        leg.add_source(leg_url)

        # NOTE(review): the value returned by scrape_page is discarded here;
        # confirm it does its work eagerly rather than returning a generator.
        self.scrape_page(SenDetail, leg_url, obj=leg)

        return leg