Пример #1
0
def test_same_name_people():
    # ensure two people with the same name don't import without birthdays
    o = Organization.objects.create(name='WWE',
                                    jurisdiction_id='jurisdiction-id')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')

    # the people have the same name but are apparently different
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data(
            [p1.as_dict(), p2.as_dict()])

    # when we give them birth dates all is well though
    p1.birth_date = '1970'
    p2.birth_date = '1930'
    resp = PersonImporter('jurisdiction-id').import_data(
        [p1.as_dict(), p2.as_dict()])
    assert resp['person']['insert'] == 2
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 0
    assert Person.objects.count() == 2

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # and now test that an update works and we can insert a new one with the same name
    p1.image = 'http://example.com/1.jpg'
    p2.birth_date = '1931'  # change birth_date, means a new insert
    resp = PersonImporter('jurisdiction-id').import_data(
        [p1.as_dict(), p2.as_dict()])
    assert Person.objects.count() == 3
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
Пример #2
0
def test_same_name_people():
    # ensure two people with the same name don't import without birthdays
    o = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')

    # the people have the same name but are apparently different
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # when we give them birth dates all is well though
    p1.birth_date = '1970'
    p2.birth_date = '1930'
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert resp['person']['insert'] == 2
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 0
    assert Person.objects.count() == 2

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # and now test that an update works and we can insert a new one with the same name
    p1.image = 'http://example.com/1.jpg'
    p2.birth_date = '1931'  # change birth_date, means a new insert
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert Person.objects.count() == 3
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
Пример #3
0
def test_same_name_people():
    o = Organization.objects.create(name='WWE',
                                    jurisdiction_id='jurisdiction-id')

    # importing two people with the same name to a pristine database should error
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data(
            [p1.as_dict(), p2.as_dict()])

    # importing one person should pass
    PersonImporter('jurisdiction-id').import_data([p1.as_dict()])
    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # importing another person with the same name should fail
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data(
            [p1.as_dict(), p2.as_dict()])

    # adding birth dates should pass
    p1.birth_date = '1970'
    p2.birth_date = '1930'
    resp = PersonImporter('jurisdiction-id').import_data(
        [p1.as_dict(), p2.as_dict()])
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
    assert Person.objects.count() == 2
    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # adding a third person with the same name but without a birthday should error
    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')

    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p3.as_dict()])

    # and now test that an update works and we can insert a new one with the same name
    p1.image = 'http://example.com/1.jpg'
    p2.birth_date = '1931'  # change birth_date, means a new insert
    resp = PersonImporter('jurisdiction-id').import_data(
        [p1.as_dict(), p2.as_dict()])
    assert Person.objects.count() == 3
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
Пример #4
0
def test_same_name_people():
    o = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')

    # importing two people with the same name to a pristine database should error
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # importing one person should pass
    PersonImporter('jurisdiction-id').import_data([p1.as_dict()])
    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # importing another person with the same name should fail
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # adding birth dates should pass
    p1.birth_date = '1970'
    p2.birth_date = '1930'
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
    assert Person.objects.count() == 2
    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # adding a third person with the same name but without a birthday should error
    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')

    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p3.as_dict()])

    # and now test that an update works and we can insert a new one with the same name
    p1.image = 'http://example.com/1.jpg'
    p2.birth_date = '1931'  # change birth_date, means a new insert
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert Person.objects.count() == 3
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
Пример #5
0
def test_same_name_second_import():
    # ensure two people with the same name don't import without birthdays
    o = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    p1.birth_date = '1970'
    p2.birth_date = '1930'

    # when we give them birth dates all is well though
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')

    with pytest.raises(SameNameError):
        resp = PersonImporter('jurisdiction-id').import_data([p3.as_dict()])
Пример #6
0
def test_same_name_second_import():
    create_jurisdiction()
    # ensure two people with the same name don't import without birthdays
    o = Organization.objects.create(name='WWE', jurisdiction_id='jid')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    p1.birth_date = '1970'
    p2.birth_date = '1930'

    # when we give them birth dates all is well though
    PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()])

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')

    with pytest.raises(SameNameError):
        PersonImporter('jid').import_data([p3.as_dict()])
Пример #7
0
    def scrape_legislator(self, legislator_id):
        old = self.api('legislators/' + legislator_id + '?')
        # just not needed
        id = old.pop('id')

        old.pop('created_at')
        old.pop('updated_at')
        old.pop('country', None)
        old.pop('level', None)
        old.pop('state')
        old.pop('leg_id')
        old.pop('active')
        # junk keys
        old.pop('suffix', None)
        old.pop('notice', None)
        old.pop('csrfmiddlewaretoken', None)
        old.pop('office_address', None)
        old.pop('office_phone', None)

        # translated
        district = old.pop('district', None)
        chamber = old.pop('chamber', None)
        image = old.pop('photo_url', '')
        name = old.pop('full_name')
        party = old.pop('party', None)

        if party in ('Nonpartisan', 'unknown', 'Unknown', 'Unaffiliated', "Non Affiliated", " "):
            party = None
        elif party == 'Democrat':
            party = 'Democratic'

        if self.state in('ne', 'dc'):
            chamber = 'legislature'

        if chamber == 'upper' and self.state == 'pr':
            pr_district = {
                '1': 'I',
                '2': 'II',
                '3': 'III',
                '4': 'IV',
                '5': 'V',
                '6': 'VI',
                '7': 'VII',
                '8': 'VIII',
            }
            if district in pr_district:
                district = pr_district[district]

        if '2008-2011' in old:
            old['old_roles']['2008-2011'] = old.pop('2008-2011')

        old_roles = old.pop('old_roles', {})

        if old['roles'] and 'Lt. Governor' in [x['type'] for x in old['roles']]:
            new = Person(name=name, district=district, party=party, image=image)
            self.jurisdiction._executive.add_post(
                'Lt. Governor',
                'lt-gov'
            )
            membership = Membership(
                person_id=new._id,
                role="Lt. Governor",
                organization_id=self.jurisdiction._executive._id
            )
            new._related.append(membership)
        else:
            new = Person(name=name, party=party, image=image)

        if id in birthdays:
            new.birth_date = birthdays[id]

        # various ids
        id_types = {'votesmart_id': 'votesmart',
                    'transparencydata_id': 'influence-explorer',
                    'nimsp_id': 'nimsp',
                    'nimsp_candidate_id': 'nimsp-candidate',
                   }
        for idname, scheme in id_types.items():
            val = old.pop(idname, None)
            if val:
                new.add_identifier(val, scheme=scheme)
        for id in old.pop('all_ids'):
            new.add_identifier(id, scheme='openstates')
            self._people[id] = new

        # contact details
        email = old.pop('email', None)
        if email:
            new.add_contact_detail(type='email', value=email, note='')
        office_keys = {'fax': 'fax',
                       'phone': 'voice',
                       'email': 'email',
                       'address': 'address'}
        for office in old.pop('offices'):
            for key, type in office_keys.items():
                if office.get(key):
                    if 'Office Hours' in office[key] and self.state == 'pa':
                        for x in office[key].split('Office Hours: '):
                            if x:
                                new.add_contact_detail(type=type, value=x, note=office['name'])
                    else:
                        new.add_contact_detail(type=type, value=office[key], note=office['name'])

        # links
        link = old.pop('url', None)
        if link:
            new.add_link(link)

        #for utah, conflict of interest is in links
        if self.state == 'ut':
            links = old.pop('+links',[])
            for l in links:
                new.add_link(note="conflict of interest form",url=l)

        # sources
        for source in old.pop('sources'):
            source.pop('retrieved', None)
            source.pop('+page', None)
            new.add_source(**source)

        # roles
        for role in old.pop('roles'):
            self.process_role(new, role, leg_id=id)

        for role_list in old_roles.values():
            for role in role_list:
                self.process_role(new, role, leg_id=id)

        # ignore most of the names for now
        old.pop('first_name')
        old.pop('middle_name')
        old.pop('suffixes')
        old.pop('nickname', None)
        new.sort_name = old.pop('last_name')

        #some places have legacy names without underscores
        old.pop('+firstname', None)
        old.pop('+lastname', None)

        gender = old.pop('+gender', None)
        if gender:
            new.gender = gender
        biography = old.pop('+biography', None)
        if biography:
            new.biography = biography
        birth_date = old.pop('+birth_date', None)
        if birth_date:
            new.birth_date = birth_date

        # keys to keep
        to_extras = ['+occupation', '+twitter', '+facebook_url', '+sworn_in_date', '+profession',
                     '+secretary', '+office_hours', '+resident_county', '+district_name',
                     '+leg_status', '+legal_position', '+title', '+start_year',
                     '+end_date', 'occupation', '+oregon_member_id',
                     '+facebook', '+youtube', '+instagram']
        for k in to_extras:
            v = old.pop(k, None)
            if v:
                new.extras[k.replace('+', '')] = v

        # keys not to keep
        to_pop = ['+office_fax', '+phone', '+room', '+fax', '+email', '+url', '+photo', '+notice',
                  '+page', '+suffix', '+city', '+address', '+additional_info_url', '+contact_form',
                  '+fax_number', '+phone_number', '+business_phone', '+email_address', '+img_url',
                  '+office_phone', '+disctict_name', '+office_loc', '+leg_url', '+office',
                  '+district_address', '+capital_address', '+bis_phone', '+capital_phone',
                  '+org_info', '+role', '+other_phone', '+home_phone', '+zip', '+zipcode',
                  '+county', '+capitol_phone', '+image_url', '+header', '+town_represented',
                  '+full_address', '+capitol_address', '+website', '+district_phone',
                  '+district_offices', '+party', '+district', '+capitol_office', '+office_address',
                 ]
        for k in to_pop:
            old.pop(k, None)

        # ensure we got it all
        assert not old, old.keys()

        return new
Пример #8
0
    def scrape_legislator(self, legislator_id):
        old = self.api('legislators/' + legislator_id + '?')
        # just not needed
        id = old.pop('id')

        old.pop('created_at')
        old.pop('updated_at')
        old.pop('country', None)
        old.pop('level', None)
        old.pop('state')
        old.pop('leg_id')
        old.pop('active')
        # junk keys
        old.pop('suffix', None)
        old.pop('notice', None)
        old.pop('csrfmiddlewaretoken', None)
        old.pop('office_address', None)
        old.pop('office_phone', None)

        # translated
        district = old.pop('district', None)
        chamber = old.pop('chamber', None)
        image = old.pop('photo_url', '')
        name = old.pop('full_name')
        party = old.pop('party', None)

        if party in ('Nonpartisan', 'unknown', 'Unknown', 'Unaffiliated',
                     "Non Affiliated", " "):
            party = None
        elif party == 'Democrat':
            party = 'Democratic'

        if self.state in ('ne', 'dc'):
            chamber = 'legislature'

        if chamber == 'upper' and self.state == 'pr':
            pr_district = {
                '1': 'I',
                '2': 'II',
                '3': 'III',
                '4': 'IV',
                '5': 'V',
                '6': 'VI',
                '7': 'VII',
                '8': 'VIII',
            }
            if district in pr_district:
                district = pr_district[district]

        if '2008-2011' in old:
            old['old_roles']['2008-2011'] = old.pop('2008-2011')

        old_roles = old.pop('old_roles', {})

        if old['roles'] and 'Lt. Governor' in [
                x['type'] for x in old['roles']
        ]:
            new = Person(name=name,
                         district=district,
                         party=party,
                         image=image)
            self.jurisdiction._executive.add_post('Lt. Governor', 'lt-gov')
            membership = Membership(
                person_id=new._id,
                role="Lt. Governor",
                organization_id=self.jurisdiction._executive._id)
            new._related.append(membership)
        else:
            new = Person(name=name, party=party, image=image)

        if id in birthdays:
            new.birth_date = birthdays[id]

        # various ids
        id_types = {
            'votesmart_id': 'votesmart',
            'transparencydata_id': 'influence-explorer',
            'nimsp_id': 'nimsp',
            'nimsp_candidate_id': 'nimsp-candidate',
        }
        for idname, scheme in id_types.items():
            val = old.pop(idname, None)
            if val:
                new.add_identifier(val, scheme=scheme)
        for id in old.pop('all_ids'):
            new.add_identifier(id, scheme='openstates')
            self._people[id] = new

        # contact details
        email = old.pop('email', None)
        if email:
            new.add_contact_detail(type='email', value=email, note='')
        office_keys = {
            'fax': 'fax',
            'phone': 'voice',
            'email': 'email',
            'address': 'address'
        }
        for office in old.pop('offices'):
            for key, type in office_keys.items():
                if office.get(key):
                    if 'Office Hours' in office[key] and self.state == 'pa':
                        for x in office[key].split('Office Hours: '):
                            if x:
                                new.add_contact_detail(type=type,
                                                       value=x,
                                                       note=office['name'])
                    else:
                        new.add_contact_detail(type=type,
                                               value=office[key],
                                               note=office['name'])

        # links
        link = old.pop('url', None)
        if link:
            new.add_link(link)

        #for utah, conflict of interest is in links
        if self.state == 'ut':
            links = old.pop('+links', [])
            for l in links:
                new.add_link(note="conflict of interest form", url=l)

        # sources
        for source in old.pop('sources'):
            source.pop('retrieved', None)
            source.pop('+page', None)
            new.add_source(**source)

        # roles
        for role in old.pop('roles'):
            self.process_role(new, role, leg_id=id)

        for role_list in old_roles.values():
            for role in role_list:
                self.process_role(new, role, leg_id=id)

        # ignore most of the names for now
        old.pop('first_name')
        old.pop('middle_name')
        old.pop('suffixes')
        old.pop('nickname', None)
        new.sort_name = old.pop('last_name')

        #some places have legacy names without underscores
        old.pop('+firstname', None)
        old.pop('+lastname', None)

        gender = old.pop('+gender', None)
        if gender:
            new.gender = gender
        biography = old.pop('+biography', None)
        if biography:
            new.biography = biography
        birth_date = old.pop('+birth_date', None)
        if birth_date:
            new.birth_date = birth_date

        # keys to keep
        to_extras = [
            '+occupation', '+twitter', '+facebook_url', '+sworn_in_date',
            '+profession', '+secretary', '+office_hours', '+resident_county',
            '+district_name', '+leg_status', '+legal_position', '+title',
            '+start_year', '+end_date', 'occupation', '+oregon_member_id',
            '+facebook', '+youtube', '+instagram'
        ]
        for k in to_extras:
            v = old.pop(k, None)
            if v:
                new.extras[k.replace('+', '')] = v

        # keys not to keep
        to_pop = [
            '+office_fax',
            '+phone',
            '+room',
            '+fax',
            '+email',
            '+url',
            '+photo',
            '+notice',
            '+page',
            '+suffix',
            '+city',
            '+address',
            '+additional_info_url',
            '+contact_form',
            '+fax_number',
            '+phone_number',
            '+business_phone',
            '+email_address',
            '+img_url',
            '+office_phone',
            '+disctict_name',
            '+office_loc',
            '+leg_url',
            '+office',
            '+district_address',
            '+capital_address',
            '+bis_phone',
            '+capital_phone',
            '+org_info',
            '+role',
            '+other_phone',
            '+home_phone',
            '+zip',
            '+zipcode',
            '+county',
            '+capitol_phone',
            '+image_url',
            '+header',
            '+town_represented',
            '+full_address',
            '+capitol_address',
            '+website',
            '+district_phone',
            '+district_offices',
            '+party',
            '+district',
            '+capitol_office',
            '+office_address',
        ]
        for k in to_pop:
            old.pop(k, None)

        # ensure we got it all
        assert not old, old.keys()

        return new
    def legislators(self, latest_only):
        legs = {}

        for member, chamber, term, url in self._memberships(latest_only):
            name, _, _, district, party = member.xpath('td')
            district = district.text
            detail_url = name.xpath('a/@href')[0]

            if party.text_content().strip() == "":
                self.warning("Garbage party: Skipping!")
                continue

            party = {
                'D': 'Democratic',
                'R': 'Republican',
                'I': 'Independent'
            }[party.text]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith('*'):
                name = name.strip('*')
                continue

            name = AKA.get(name, name)

            if name in legs:
                p, terms = legs[name]
                terms.append((chamber, district, term, party))
            else:
                p = Person(name, party=party)
                legs[name] = p, [(chamber, district, term, party)]

            p.add_source(url)
            p.add_source(detail_url)
            p.add_link(detail_url)

            birth_date = BIRTH_DATES.get(name, None)
            if birth_date:
                p.birth_date = birth_date

            leg_html = self.get(detail_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(detail_url)

            hotgarbage = ('Senate Biography Information for the 98th General '
                          'Assembly is not currently available.')

            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning('No legislator bio available for ' + name)
                continue

            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]
            p.image = photo_url

            p.contact_details = []
            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                p.add_contact_detail(type='email',
                                     value=email[0].tail.strip(),
                                     note='capitol')

            offices = {
                'capitol': '//table[contains(string(), "Springfield Office")]',
                'district': '//table[contains(string(), "District Office")]'
            }

            for location, xpath in offices.items():
                table = leg_doc.xpath(xpath)
                if table:
                    for type, value in self._table_to_office(table[3]):
                        p.add_contact_detail(type=type,
                                             value=value,
                                             note=location)

        return legs
Пример #10
0
    def legislators(self, latest_only):
        legs = {}

        for member, chamber, term, url in self._memberships(latest_only):
            name, _, _, district, party = member.xpath('td')
            district = district.text
            detail_url = name.xpath('a/@href')[0]

            if party.text_content().strip() == "":
                self.warning("Garbage party: Skipping!")
                continue

            party = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}[party.text]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith('*'):
                name = name.strip('*')
                continue

            name = AKA.get(name, name)

            if name in legs:
                p, terms = legs[name]
                terms.append((chamber, district, term, party))
            else:
                p = Person(name, party=party)
                legs[name] = p, [(chamber, district, term, party)]

            p.add_source(url)
            p.add_source(detail_url)
            p.add_link(detail_url)

            birth_date = BIRTH_DATES.get(name, None)
            if birth_date:
                p.birth_date = birth_date

            leg_html = self.get(detail_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(detail_url)

            hotgarbage = (
                'Senate Biography Information for the 98th General '
                'Assembly is not currently available.')

            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning('No legislator bio available for ' + name)
                continue

            photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]
            p.image = photo_url

            p.contact_details = []
            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                p.add_contact_detail(type='email', value=email[0].tail.strip(), note='capitol')

            offices = {'capitol': '//table[contains(string(), "Springfield Office")]',
                       'district': '//table[contains(string(), "District Office")]'}

            for location, xpath in offices.items():
                table = leg_doc.xpath(xpath)
                if table:
                    for type, value in self._table_to_office(table[3]):
                        if type in ('fax', 'voice') and not validate_phone_number(value):
                            continue

                        p.add_contact_detail(type=type, value=value, note=location)

        return legs