Пример #1
0
def test_full_person():
    """Round-trip a fully populated scraped person through PersonImporter.

    Builds a ScrapePerson carrying an identifier, an alternate name, a
    contact detail, a link and a source, imports it, and checks that the
    stored Person exposes the same values on its related collections.
    """
    scraped = ScrapePerson('Tom Sawyer')
    scraped.add_identifier('1')
    scraped.add_name('Tommy', start_date='1880')
    scraped.add_contact_detail(
        type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # Serialize the scraped person and hand it to the importer.
    payload = scraped.as_dict()
    PersonImporter('jurisdiction-id').import_data([payload])

    # Read the person back and compare it field by field.
    stored = Person.objects.get()
    assert 'ocd-person' in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == '1'
    assert identifier.scheme == ''

    other_name = stored.other_names.all()[0]
    assert other_name.name == 'Tommy'
    assert other_name.start_date == '1880'

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
    assert stored.sources.all()[0].url == 'http://example.com/source'
Пример #2
0
def test_full_person():
    """Check that every piece of a scraped person survives the import.

    The scraped person is given one identifier, one alternate name, one
    contact detail, one link and one source; after import, the first row of
    each related collection on the stored Person must match what was added.
    """
    tom = ScrapePerson('Tom Sawyer')
    tom.add_identifier('1')
    tom.add_name('Tommy', start_date='1880')
    tom.add_contact_detail(type='phone',
                           value='555-555-1234',
                           note='this is fake')
    tom.add_link('http://example.com/link')
    tom.add_source('http://example.com/source')

    # import the serialized person
    as_dict = tom.as_dict()
    PersonImporter('jurisdiction-id').import_data([as_dict])

    # fetch the stored person and verify the import
    db_person = Person.objects.get()
    assert 'ocd-person' in db_person.id
    assert db_person.name == tom.name

    first_identifier = db_person.identifiers.all()[0]
    assert (first_identifier.identifier, first_identifier.scheme) == ('1', '')

    first_alias = db_person.other_names.all()[0]
    assert (first_alias.name, first_alias.start_date) == ('Tommy', '1880')

    first_contact = db_person.contact_details.all()[0]
    assert first_contact.type == 'phone'
    assert first_contact.value == '555-555-1234'
    assert first_contact.note == 'this is fake'

    first_link = db_person.links.all()[0]
    first_source = db_person.sources.all()[0]
    assert first_link.url == 'http://example.com/link'
    assert first_source.url == 'http://example.com/source'
Пример #3
0
def test_deduplication_other_name_overlaps():
    """Importing a person whose alternate name overlaps must not add a record.

    ``create_person()`` is defined outside this snippet; presumably it stores
    a person named 'Dwayne Johnson' -- confirm against the helper.
    """
    create_person()

    # The primary name is new, but the alternate name overlaps an existing one.
    incoming = ScrapePerson('The Rock')
    incoming.add_name('Dwayne Johnson')
    payload = incoming.as_dict()

    importer = PersonImporter('jurisdiction-id')
    importer.import_data([payload])

    # Still exactly one Person after the import.
    assert Person.objects.all().count() == 1
Пример #4
0
def test_deduplication_other_name_overlaps():
    """A scraped person matched only through an other_name is deduplicated.

    After ``create_person()`` (defined elsewhere) and one import of a person
    named 'The Rock' with the other name 'Dwayne Johnson', the Person table
    must hold a single row.
    """
    create_person()

    # Person has other_name that overlaps w/ existing name
    the_rock = ScrapePerson('The Rock')
    the_rock.add_name('Dwayne Johnson')
    scraped_dicts = [the_rock.as_dict()]
    PersonImporter('jurisdiction-id').import_data(scraped_dicts)

    person_count = Person.objects.all().count()
    assert person_count == 1
Пример #5
0
def test_same_name_people_other_name():
    """Name-collision detection must take other_names into account.

    Two scraped people with different images are imported together; the
    second one carries the first one's name as an alternate name, so the
    import is expected to raise SameNameError.
    """
    # ensure we're taking other_names into account for the name collision code
    # (the created organization is not referenced again, so it is not bound)
    Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Rock', image='http://example.com/2')
    p2.add_name('Dwayne Johnson')

    # the people have the same name but are apparently different
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
Пример #6
0
def test_same_name_people_other_name():
    """Two distinct people sharing a name via other_names must be rejected.

    The first person is named 'Dwayne Johnson'; the second is named 'Rock'
    but lists 'Dwayne Johnson' as an alternate name.  Importing both in one
    batch is expected to raise SameNameError.
    """
    create_jurisdiction()

    # An organization is created up front in the same jurisdiction.
    Organization.objects.create(name='WWE', jurisdiction_id='jid')

    johnson = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    rock = ScrapePerson('Rock', image='http://example.com/2')
    rock.add_name('Dwayne Johnson')

    # Same name, different images: the importer should refuse the batch.
    with pytest.raises(SameNameError):
        importer = PersonImporter('jid')
        importer.import_data([johnson.as_dict(), rock.as_dict()])
Пример #7
0
    def scrape(self):
        """Yield council members as Person objects, then committee Organizations.

        Rows returned by ``self.councilMembers`` are grouped by the member's
        profile URL; rows without a URL are skipped.  Each member's rows are
        collapsed into terms: consecutive rows for the same district whose
        dates are exactly one day apart are merged into one term.  Party,
        photo, e-mail, web site, notes and source come from the member's
        last row.

        Committees are taken from the committee list stored with the first
        row seen for each URL.  They are yielded after all people, followed
        by two hard-coded organizations.
        """
        noncommittees = {'Committee of the Whole'}
        committee_d = {}

        people_d = {}

        # Go to memberlist
        extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

        for councilman, committees in self.councilMembers(
                extra_args=extra_args):
            if 'url' not in councilman['Person Name']:
                continue

            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

        for person_entries, committees in people_d.values():

            # The last row collected for this person supplies the details.
            councilman = person_entries[-1]

            p = Person(councilman['Person Name']['label'])

            if p.name == 'Letitia James':
                p.name = 'Letitia Ms. James'
                p.add_name('Letitia James')

            spans = [(self.toTime(entry['Start Date']).date(),
                      self.toTime(entry['End Date']).date(),
                      entry['District'])
                     for entry in person_entries]

            # Walk the spans in date order, extending the current span while
            # the next one starts the day after it ends in the same district.
            merged_spans = []
            last_end_date = None
            last_district = None
            for start_date, end_date, district in sorted(spans):
                if last_end_date is None:
                    span = [start_date, end_date, district]
                elif ((start_date - last_end_date) == datetime.timedelta(1)
                      and district == last_district):
                    span[1] = end_date
                else:
                    merged_spans.append(span)
                    span = [start_date, end_date, district]

                last_end_date = end_date
                last_district = district

            merged_spans.append(span)

            for start_date, end_date, district in merged_spans:
                # Use the district recorded for this span, not the district
                # of the person's last row, so earlier terms keep their own.
                district = district.replace(' 0', ' ')
                # 2017-12-31 is recorded as an empty (open) end date;
                # presumably the end of the then-current term -- confirm.
                if end_date == datetime.date(2017, 12, 31):
                    end_date = ''
                else:
                    end_date = end_date.isoformat()
                p.add_term('Council Member',
                           'legislature',
                           district=district,
                           start_date=start_date.isoformat(),
                           end_date=end_date)

            party = councilman['Political Party']
            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if councilman['Photo']:
                p.image = councilman['Photo']

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.extras = {'Notes': councilman['Notes']}

            p.add_source(councilman['Person Name']['url'], note='web')

            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']
                if (committee_name not in noncommittees
                        and 'committee' in committee_name.lower()):
                    # Create each committee once and reuse it for later members.
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        parent_id = PARENT_ORGS.get(committee_name,
                                                    'New York City Council')
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name': parent_id})
                        o.add_source(committee['Department Name']['url'])
                        committee_d[committee_name] = o

                    membership = o.add_member(p, role=committee["Title"])
                    membership.start_date = self.mdY2Ymd(
                        committee["Start Date"])
            yield p

        # Names containing 'Committee' are yielded before names containing
        # 'Subcommittee'.
        # NOTE(review): a name containing both words is yielded twice.
        for o in committee_d.values():
            if 'Committee' in o.name:
                yield o

        for o in committee_d.values():
            if 'Subcommittee' in o.name:
                yield o

        o = Organization(
            'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
            classification='committee',
            parent_id={'name': 'New York City Council'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o

        o = Organization(
            'Subcommittee on Drug Abuse',
            classification='committee',
            parent_id={
                'name':
                'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'
            })
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o
    def scrape(self):
        """Yield council members as Person objects, then committee Organizations.

        Rows returned by ``self.councilMembers()`` are grouped by the
        member's profile URL; rows without a URL are skipped.  Each member's
        rows are collapsed into terms: consecutive rows for the same district
        whose dates are exactly one day apart are merged into one term.
        Party, photo, e-mail, web site, notes and source come from the
        member's last row.

        Committees are taken from the committee list stored with the first
        row seen for each URL.  They are yielded after all people, followed
        by two hard-coded organizations.
        """
        noncommittees = {'Committee of the Whole'}
        committee_d = {}

        people_d = {}

        for councilman, committees in self.councilMembers():
            if 'url' not in councilman['Person Name']:
                continue

            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

        for person_entries, committees in people_d.values():

            # The last row collected for this person supplies the details.
            councilman = person_entries[-1]

            p = Person(councilman['Person Name']['label'])

            if p.name == 'Letitia James':
                p.name = 'Letitia Ms. James'
                p.add_name('Letitia James')

            spans = [(self.toTime(entry['Start Date']).date(),
                      self.toTime(entry['End Date']).date(),
                      entry['District'])
                     for entry in person_entries]

            # Walk the spans in date order, extending the current span while
            # the next one starts the day after it ends in the same district.
            merged_spans = []
            last_end_date = None
            last_district = None
            for start_date, end_date, district in sorted(spans):
                if last_end_date is None:
                    span = [start_date, end_date, district]
                elif ((start_date - last_end_date) == datetime.timedelta(1)
                      and district == last_district):
                    span[1] = end_date
                else:
                    merged_spans.append(span)
                    span = [start_date, end_date, district]

                last_end_date = end_date
                last_district = district

            merged_spans.append(span)

            for start_date, end_date, district in merged_spans:
                # Use the district recorded for this span, not the district
                # of the person's last row, so earlier terms keep their own.
                district = district.replace(' 0', ' ')
                # 2017-12-31 is recorded as an empty (open) end date;
                # presumably the end of the then-current term -- confirm.
                if end_date == datetime.date(2017, 12, 31):
                    end_date = ''
                else:
                    end_date = end_date.isoformat()
                p.add_term('Council Member', 'legislature',
                           district=district,
                           start_date=start_date.isoformat(),
                           end_date=end_date)

            party = councilman['Political Party']
            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if councilman['Photo']:
                p.image = councilman['Photo']

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.extras = {'Notes': councilman['Notes']}

            p.add_source(councilman['Person Name']['url'], note='web')

            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']
                if (committee_name not in noncommittees
                        and 'committee' in committee_name.lower()):
                    # Create each committee once and reuse it for later members.
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        parent_id = PARENT_ORGS.get(committee_name,
                                                    'New York City Council')
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name': parent_id})
                        o.add_source(committee['Department Name']['url'])
                        committee_d[committee_name] = o

                    membership = o.add_member(p, role=committee["Title"])
                    membership.start_date = self.mdY2Ymd(committee["Start Date"])
            yield p

        # Names containing 'Committee' are yielded before names containing
        # 'Subcommittee'.
        # NOTE(review): a name containing both words is yielded twice.
        for o in committee_d.values():
            if 'Committee' in o.name:
                yield o

        for o in committee_d.values():
            if 'Subcommittee' in o.name:
                yield o

        o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                         classification='committee',
                         parent_id={'name': 'New York City Council'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o

        o = Organization('Subcommittee on Drug Abuse',
                         classification='committee',
                         parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o