Пример #1
0
    def get_member(self, session, chamber, kpid):
        """Fetch one member from the ``ksapi`` members endpoint as a ``Person``.

        Args:
            session: Session identifier such as ``"2019-2020"``. It must be a
                key of the slug table below; any other value raises
                ``KeyError``.
            chamber: Passed through unchanged as the person's ``primary_org``.
            kpid: Member identifier used in both the API URL and the
                kslegislature.org biography URL.

        Yields:
            A single ``Person`` with party, district, photo, occupation and
            Capitol office contact details. The biography link is attached
            only when the biography page could be fetched.
        """
        url = "%smembers/%s" % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)["content"]

        party = content["PARTY"]
        if party == "Democrat":
            party = "Democratic"

        slug = {
            "2013-2014": "b2013_14",
            "2015-2016": "b2015_16",
            "2017-2018": "b2017_18",
            "2019-2020": "b2019_20",
        }[session]
        leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug,
                                                                      kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            # One-element unpacking is deliberate: any result other than
            # exactly one profile picture raises ValueError instead of
            # silently picking a wrong image.
            (photo_url,
             ) = legislator_page.xpath('//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                content["FULLNAME"]))
            leg_url = ""
            photo_url = ""

        person = Person(
            name=content["FULLNAME"],
            district=str(content["DISTRICT"]),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {"occupation": content["OCCUPATION"]}

        address = "\n".join([
            "Room {}".format(content["OFFICENUM"]),
            "Kansas State Capitol Building",
            "300 SW 10th St.",
            "Topeka, KS 66612",
        ])

        note = "Capitol Office"
        person.add_contact_detail(type="address", value=address, note=note)
        # Optional fields are only recorded when the API actually supplies a
        # value, so no empty contact details are emitted.
        if content["EMAIL"]:
            person.add_contact_detail(type="email",
                                      value=content["EMAIL"],
                                      note=note)
        if content["OFFPH"]:
            person.add_contact_detail(type="voice",
                                      value=content["OFFPH"],
                                      note=note)

        person.add_source(url)
        # leg_url is blanked above when the biography page is missing; do not
        # attach an empty link in that case.
        if leg_url:
            person.add_link(leg_url)

        yield person
Пример #2
0
    def get_member(self, session, chamber, kpid):
        """Fetch one member from the ``ksapi`` members endpoint as a ``Person``.

        Args:
            session: Session identifier such as ``'2017-2018'``. It must be a
                key of the slug table below; any other value raises
                ``KeyError``.
            chamber: Passed through unchanged as the person's ``primary_org``.
            kpid: Member identifier used in both the API URL and the
                kslegislature.org biography URL.

        Yields:
            A single ``Person`` with party, district, photo, occupation and
            Capitol office contact details. The biography link is attached
            only when the biography page could be fetched.
        """
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        # Biennium -> path segment used by kslegislature.org.
        slug = {
            '2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18',
            '2019-2020': 'b2019_20',
        }[session]
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug,
                                                                      kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            # One-element unpacking is deliberate: any result other than
            # exactly one profile picture raises ValueError instead of
            # silently picking a wrong image.
            photo_url, = legislator_page.xpath(
                '//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                content['FULLNAME']))
            leg_url = ''
            photo_url = ''

        person = Person(
            name=content['FULLNAME'],
            district=str(content['DISTRICT']),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {'occupation': content['OCCUPATION']}

        address = '\n'.join([
            'Room {}'.format(content['OFFICENUM']),
            'Kansas State Capitol Building',
            '300 SW 10th St.',
            'Topeka, KS 66612',
        ])

        note = 'Capitol Office'
        person.add_contact_detail(type='address', value=address, note=note)
        # Optional fields are only recorded when the API actually supplies a
        # value, so no empty contact details are emitted.
        if content['EMAIL']:
            person.add_contact_detail(type='email',
                                      value=content['EMAIL'],
                                      note=note)
        if content['OFFPH']:
            person.add_contact_detail(type='voice',
                                      value=content['OFFPH'],
                                      note=note)

        person.add_source(url)
        # leg_url is blanked above when the biography page is missing; do not
        # attach an empty link in that case.
        if leg_url:
            person.add_link(leg_url)

        yield person
Пример #3
0
    def get_member(self, session, chamber, kpid):
        """Fetch one member from the ``ksapi`` members endpoint as a ``Person``.

        Args:
            session: Session identifier such as ``'2019-2020'``. It must be a
                key of the slug table below; any other value raises
                ``KeyError``.
            chamber: Passed through unchanged as the person's ``primary_org``.
            kpid: Member identifier used in both the API URL and the
                kslegislature.org biography URL.

        Yields:
            A single ``Person`` with party, district, photo, occupation and
            Capitol office contact details. The biography link is attached
            only when the biography page could be fetched.
        """
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        slug = {'2013-2014': 'b2013_14',
                '2015-2016': 'b2015_16',
                '2017-2018': 'b2017_18',
                '2019-2020': 'b2019_20',
                }[session]
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            # One-element unpacking is deliberate: any result other than
            # exactly one profile picture raises ValueError instead of
            # silently picking a wrong image.
            photo_url, = legislator_page.xpath(
                '//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
            leg_url = ''
            photo_url = ''

        person = Person(
            name=content['FULLNAME'],
            district=str(content['DISTRICT']),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {'occupation': content['OCCUPATION']}

        address = '\n'.join([
            'Room {}'.format(content['OFFICENUM']),
            'Kansas State Capitol Building',
            '300 SW 10th St.',
            'Topeka, KS 66612',
        ])

        note = 'Capitol Office'
        person.add_contact_detail(type='address', value=address, note=note)
        # Optional fields are only recorded when the API actually supplies a
        # value, so no empty contact details are emitted.
        if content['EMAIL']:
            person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
        if content['OFFPH']:
            person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

        person.add_source(url)
        # leg_url is blanked above when the biography page is missing; do not
        # attach an empty link in that case.
        if leg_url:
            person.add_link(leg_url)

        yield person
    def _scrape_lower_chamber(self):
        """Scrape the House roster and yield a ``Person`` per seated member.

        The roster page at ``self._reps_url`` contains an ASP.NET grid whose
        data rows hold, in order: last name, first name, district, party,
        phone and room. Rows whose last name is ``Vacant`` are passed to
        ``self._save_vacant_legislator`` and not yielded. Every other row
        costs one extra request to the member's details page
        (``self._rep_details_url``) for the e-mail address and photo.

        Yields:
            ``Person`` objects for non-vacant seats.
        """
        self.info('Scraping lower chamber for legislators.')

        chamber = 'lower'
        office_note = 'Capitol Office'

        # Parties for members whose roster row leaves the party cell blank.
        # Keys match the name exactly as assembled below, including the
        # leading space produced by an empty first-name cell.
        party_override = {
            " Green": "Democratic",
            " Sisco": "Republican"
        }

        roster_url = self._reps_url
        page = lxml.html.fromstring(self.get(roster_url).text)
        # This is the ASP.net table container
        table_xpath = 'id("ContentPlaceHolder1_gridMembers_DXMainTable")'
        table = page.xpath(table_xpath)[0]

        # The first row is skipped (header row).
        for tr in table.xpath('tr')[1:]:
            # If a given term hasn't occurred yet, then ignore it
            # Eg, in 2017, the 2018 term page will have a blank table
            if tr.attrib.get('class') == 'dxgvEmptyDataRow':
                self.warning('No House members found')
                return

            tds = tr.xpath('td')
            last_name = tds[0].text_content().strip()
            first_name = tds[1].text_content().strip()
            full_name = '{} {}'.format(first_name, last_name)
            # int() round-trip normalises the district (e.g. drops padding).
            district = str(int(tds[2].text_content().strip()))

            party = tds[3].text_content().strip()
            if party == 'Democrat':
                party = 'Democratic'
            elif not party:
                # Blank party cell: consult the manual override first and
                # only then fall back to "Other". Doing the fallback first
                # would make the override table unreachable.
                party = party_override.get(full_name, "Other")

            phone = tds[4].text_content().strip()
            room = tds[5].text_content().strip()
            address = self._assumed_address_fmt.format(room)

            # Everything the roster row alone provides is common to vacant
            # and seated entries.
            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            person.add_source(roster_url)
            person.add_contact_detail(type='address',
                                      value=address,
                                      note=office_note)
            if phone:
                person.add_contact_detail(type='voice',
                                          value=phone,
                                          note=office_note)

            if last_name == 'Vacant':
                self._save_vacant_legislator(person)
                continue

            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)
            person.add_source(details_url)
            person.add_link(details_url)

            mailto_hrefs = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"]'
                '/table/tr[4]/td/a/@href')
            # A bare "mailto:" href means the page lists no address.
            if mailto_hrefs and mailto_hrefs[0].lower() != 'mailto:':
                person.add_contact_detail(type='email',
                                          value=mailto_hrefs[0].split(':')[1],
                                          note=office_note)

            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if picture:
                person.image = picture[0]

            yield person
Пример #5
0
    def _parse_person(self, row, chamber, seat_map):
        """Build a ``Person`` from one roster row.

        Args:
            row: Mapping providing the keys read below: ``FirstName``,
                ``MiddleName``, ``LastName``, ``County``, ``District``,
                ``party``, ``WorkEmail``, ``Address``, ``address2``,
                ``city``, ``State``, ``Zipcode``, ``Phone`` and, for the
                lower chamber, ``seatno``.
            chamber: ``'upper'`` or ``'lower'``. It selects the district
                format and which profile URL template is used.
            seat_map: Mapping from ``row['seatno']`` to the seat number
                substituted into ``self.house_profile_url``.

        Returns:
            The populated ``Person``.

        Raises:
            KeyError: if ``row['party']`` (upper-cased) is not in
                ``self.party_map`` or a required row key is missing.
        """
        # Capture legislator vitals.
        first_name = row['FirstName']
        middle_name = row['MiddleName']
        last_name = row['LastName']
        full_name = '{} {} {}'.format(first_name, middle_name, last_name)
        # Collapse the double space left behind by an empty middle name.
        full_name = re.sub(r'[\s]{2,}', ' ', full_name)

        if chamber == 'lower':
            district = '{} {}'.format(row['County'], int(row['District'])).strip()
        else:
            district = str(int(row['District'])).strip()

        party = self.party_map[row['party'].upper()]
        email = row['WorkEmail']

        person = Person(primary_org=chamber,
                        district=district,
                        name=full_name,
                        party=party)

        person.extras = {
            'first_name': first_name,
            'middle_name': middle_name,
            'last_name': last_name
        }

        if email:
            person.add_contact_detail(type='email', value=email, note='District Office')

        # Capture legislator office contact information.
        # NOTE(review): the template always contributes a comma that strip()
        # does not remove, so this string is never empty and the check below
        # is always true, even for a row with no address data. Confirm
        # whether such rows occur before tightening it.
        district_address = '{}\n{}\n{}, {} {}'.format(row['Address'],
                                                      row['address2'],
                                                      row['city'], row['State'],
                                                      row['Zipcode']).strip()
        phone = row['Phone'].strip()

        if district_address:
            person.add_contact_detail(type='address', value=district_address, note='Home Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone, note='Home Office')

        # Retrieve legislator portrait.
        profile_url = None
        if chamber == 'upper':
            profile_url = self.senate_profile_url.format(row['District'])
        elif chamber == 'lower':
            # Best effort: a member without a known seat simply gets no
            # portrait or profile source.
            try:
                seat_number = seat_map[row['seatno']]
                profile_url = self.house_profile_url.format(seat_number)
            except KeyError:
                pass

        if profile_url:
            person.image = self._get_photo(profile_url, chamber)
            person.add_source(profile_url)

        return person
Пример #6
0
    def scrape(self):
        """Yield committee ``Organization``s and council ``Person``s.

        Two data sets are combined per person, matched on the stripped name:

        * the Legistar web member list (``LegistarPersonScraper``), which
          supplies district, party, photo, e-mail, web site and notes;
        * the API office records of the 'City Council' body, which supply one
          term per record.

        People who appear only in the API get an empty web record, so every
        web lookup for them returns ``None``.

        Yields:
            Each committee-like ``Organization`` as it is built, followed by
            every ``Person`` collected (council members plus anyone seen only
            as a committee member).

        Raises:
            AssertionError: if the set of names from the web list and from
                the API office records differ.
        """
        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        # Exactly one body is expected to be named 'City Council'; anything
        # else makes this unpacking raise ValueError.
        city_council, = [body for body in self.bodies()
                         if body['BodyName'] == 'City Council']

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates)
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info) == set(terms), (
            'Names differ between web and API data: {}'.format(
                sorted(set(web_info) ^ set(terms))))

        # Web field name -> (contact detail type, note).
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            # .get() is used throughout so that missing web fields never
            # insert keys into the placeholder defaultdict.
            district = web.get('District', '').replace(' 0', ' ')

            for term in offices:
                if term['OfficeRecordTitle'] == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            # Everything below describes the person, not a single term, so it
            # is recorded once rather than once per office record. Repeating
            # it per term would duplicate contact details and links.
            party = web.get('Political Party')

            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if web.get('Photo'):
                p.image = web['Photo']

            for contact_type, (type_, _note) in contact_types.items():
                value = web.get(contact_type)
                # `web` is a mapping, so it must be read, not called.
                if value and value != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=value,
                                         note=_note)

            if web.get('E-mail'):
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['url'],
                                     note='E-mail')

            if web.get('Web site'):
                p.add_link(web['Web site']['url'], note='web site')

            if web.get('Notes'):
                p.extras = {'Notes': web['Notes']}

            members[member] = p

        committee_types = ['Committee',
                           'Inactive Committee',
                           'Select Committee',
                           'Subcommittee',
                           'Task Force',
                           'Land Use',  # Committee on Land Use
                           ]

        body_types = {k: v for k, v in self.body_types().items()
                      if k in committee_types}

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member who never held a council office:
                        # create a bare person with sources only.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(office['OfficeRecordStartDate']),
                                     end_date=self.toDate(office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
Пример #7
0
    def scrape(self):
        """Yield committee ``Organization``s and council ``Person``s.

        Two data sets are combined per person, matched on the stripped name:

        * the Legistar web member list (``LegistarPersonScraper``), which
          supplies district, party, photo, e-mail, web site and notes;
        * the API office records of the 'City Council' body, which supply one
          term per record.

        People who appear only in the API get an empty web record, so every
        web lookup for them returns ``None``.

        Yields:
            Each committee-like ``Organization`` as it is built, followed by
            every ``Person`` collected (council members plus anyone seen only
            as a committee member).

        Raises:
            AssertionError: if the set of names from the web list and from
                the API office records differ.
        """
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        # Exactly one body is expected to be named 'City Council'; anything
        # else makes this unpacking raise ValueError.
        city_council, = [
            body for body in self.bodies()
            if body['BodyName'] == 'City Council'
        ]

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates)
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info) == set(terms), (
            'Names differ between web and API data: {}'.format(
                sorted(set(web_info) ^ set(terms))))

        # Web field name -> (contact detail type, note).
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            # .get() is used throughout so that missing web fields never
            # insert keys into the placeholder defaultdict.
            district = web.get('District', '').replace(' 0', ' ')

            for term in offices:
                if term['OfficeRecordTitle'] == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            # Everything below describes the person, not a single term, so it
            # is recorded once rather than once per office record. Repeating
            # it per term would duplicate contact details and links.
            party = web.get('Political Party')

            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if web.get('Photo'):
                p.image = web['Photo']

            for contact_type, (type_, _note) in contact_types.items():
                value = web.get(contact_type)
                # `web` is a mapping, so it must be read, not called.
                if value and value != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=value,
                                         note=_note)

            if web.get('E-mail'):
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['url'],
                                     note='E-mail')

            if web.get('Web site'):
                p.add_link(web['Web site']['url'], note='web site')

            if web.get('Notes'):
                p.extras = {'Notes': web['Notes']}

            members[member] = p

        committee_types = [
            'Committee',
            'Inactive Committee',
            'Select Committee',
            'Subcommittee',
            'Task Force',
            'Land Use',  # Committee on Land Use
        ]

        body_types = {
            k: v
            for k, v in self.body_types().items() if k in committee_types
        }

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'],
                                             'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member who never held a council office:
                        # create a bare person with sources only.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
    def scrape(self):
        """Yield council members, then the committees they belong to.

        ``self.councilMembers()`` may return several entries for the same
        person (one per stint). Entries are grouped by the person's URL, and
        back-to-back stints in the same district are merged into one term.
        Person-level details (party, photo, e-mail, web site, notes) are
        taken from the last entry seen for that person.

        Yields:
            Each ``Person``, then every collected ``Organization`` whose name
            contains 'Committee', then every one whose name contains
            'Subcommittee', and finally two hard-coded organizations (a
            committee and its subcommittee) sourced from the departments
            page.
        """
        noncommittees = {'Committee of the Whole'}
        committee_d = {}

        # url -> ([entries for that person], committees of the first entry)
        people_d = {}

        for councilman, committees in self.councilMembers():
            # Entries without a URL cannot be grouped and are dropped.
            if 'url' in councilman['Person Name']:
                councilman_url = councilman['Person Name']['url']

                if councilman_url in people_d:
                    people_d[councilman_url][0].append(councilman)
                else:
                    people_d[councilman_url] = [councilman], committees

        for person_entries, committees in people_d.values():

            councilman = person_entries[-1]

            p = Person(councilman['Person Name']['label'])

            if p.name == 'Letitia James':
                p.name = 'Letitia Ms. James'
                p.add_name('Letitia James')

            spans = [(self.toTime(entry['Start Date']).date(),
                      self.toTime(entry['End Date']).date(),
                      entry['District'])
                     for entry in person_entries]

            # Merge a span into the previous one when it starts the day after
            # the previous span ended and the district is unchanged.
            merged_spans = []
            last_end_date = None
            last_district = None
            for start_date, end_date, district in sorted(spans):
                if last_end_date is None:
                    span = [start_date, end_date, district]
                elif ((start_date - last_end_date) == datetime.timedelta(1)
                      and district == last_district):
                    span[1] = end_date
                else:
                    merged_spans.append(span)
                    span = [start_date, end_date, district]

                last_end_date = end_date
                last_district = district

            merged_spans.append(span)

            # NOTE(review): every term is filed under the district of the
            # last entry; the district stored on each merged span is not
            # used. Confirm this is intended for people who changed district.
            term_district = councilman['District'].replace(' 0', ' ')

            for start_date, end_date, _ in merged_spans:
                # A span ending on this date is recorded as open-ended.
                if end_date == datetime.date(2017, 12, 31):
                    end_date = ''
                else:
                    end_date = end_date.isoformat()
                p.add_term('Council Member', 'legislature',
                           district=term_district,
                           start_date=start_date.isoformat(),
                           end_date=end_date)

            party = councilman['Political Party']
            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if councilman['Photo']:
                p.image = councilman['Photo']

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.extras = {'Notes': councilman['Notes']}

            p.add_source(councilman['Person Name']['url'], note='web')

            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']
                if committee_name not in noncommittees and 'committee' in committee_name.lower():
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        parent_id = PARENT_ORGS.get(committee_name,
                                                    'New York City Council')
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name': parent_id})
                        o.add_source(committee['Department Name']['url'])
                        committee_d[committee_name] = o

                    membership = o.add_member(p, role=committee["Title"])
                    membership.start_date = self.mdY2Ymd(committee["Start Date"])
            yield p

        # Two passes, so organizations named 'Committee ...' are emitted
        # before those named 'Subcommittee ...'.
        for o in committee_d.values():
            if 'Committee' in o.name:
                yield o

        for o in committee_d.values():
            if 'Subcommittee' in o.name:
                yield o

        o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                         classification='committee',
                         parent_id={'name': 'New York City Council'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o

        o = Organization('Subcommittee on Drug Abuse',
                         classification='committee',
                         parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o
Пример #9
0
    def scrape_session(self, session, chambers):
        """Yield a ``Person`` for each sitting member of ``session``.

        The member list comes from ``self.sservice.GetMembersBySession`` and
        each member is then fetched individually with ``GetMember``. Members
        whose service record for the session has a ``DateVacated`` value are
        skipped.

        :param session: key used to look up the site id in
            ``SESSION_SITE_IDS``.
        :param chambers: accepted but not used by this method.
        :raises Exception: when a member has no service record matching the
            requested session.
        """
        sid = SESSION_SITE_IDS[session]
        members = backoff(
            self.sservice.GetMembersBySession,
            sid
        )['MemberListing']

        for member in members:
            guid = member['Id']
            member_info = backoff(self.sservice.GetMember, guid)

            # Find the member's service record for this session. next()
            # reports "no match" with StopIteration (not IndexError); left
            # uncaught inside a generator it would turn into an opaque
            # RuntimeError, so translate it into the intended error here.
            try:
                legislative_service = next(
                    service for service
                    in member_info['SessionsInService']['LegislativeService']
                    if service['Session']['Id'] == sid
                )
            except (StopIteration, IndexError):
                raise Exception("Something very bad is going on with the "
                                "Legislative service")

            # Check to see if the member has vacated; skip if so:
            if legislative_service['DateVacated']:
                continue

            nick_name, first_name, middle_name, last_name = (
                member_info['Name'][x] for x in [
                    'Nickname', 'First', 'Middle', 'Last'
                ]
            )

            # A nickname, when present, replaces the formal first name.
            first_name = nick_name if nick_name else first_name

            if middle_name:
                full_name = "%s %s %s" % (first_name, middle_name, last_name)
            else:
                full_name = "%s %s" % (first_name, last_name)

            party = legislative_service['Party']

            if party == 'Democrat':
                party = 'Democratic'

            elif party.strip() == '':
                party = 'other'

            chamber, district = (
                legislative_service['District'][x] for x in [
                    'Type', 'Number'
                ]
            )

            chamber = {
                "House": 'lower',
                "Senate": 'upper'
            }[chamber]

            url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
                                              {"code": guid, "sid": sid})

            legislator = Person(
                name=full_name,
                district=str(district),
                party=party,
                primary_org=chamber,
                image=photo,
            )
            legislator.extras = {
                'last_name': last_name,
                'first_name': first_name,
                'guid': guid,
            }

            capitol_address = self.clean_list([
                member_info['Address'][x] for x in [
                    'Street', 'City', 'State', 'Zip'
                ]
            ])

            capitol_address = " ".join(
                addr_component for addr_component
                in capitol_address if addr_component
            ).strip()

            # Order matters below: [0] is email, [1] is phone, [2] is fax.
            capitol_contact_info = self.clean_list([
                member_info['Address'][x] for x in [
                    'Email', 'Phone', 'Fax'
                ]
            ])

            # Sometimes email is set to a long cryptic string.
            # If it doesn't have a @ character, simply set it to None
            # examples:
            # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
            # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=

            if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
                capitol_contact_info[0] = None

            # if we have more than 2 chars (eg state)
            # or a phone/fax/email address record the info
            if len(capitol_address) > 2 or not capitol_contact_info.count(None) == 3:
                # NOTE(review): the marker is matched in the email slot but
                # the phone slot is the one cleared -- confirm which field
                # was meant.
                if capitol_contact_info[0] and '*****@*****.**' in capitol_contact_info[0]:
                    self.warning("XXX: GA SITE WAS HACKED.")
                    capitol_contact_info[1] = None

                if capitol_address.strip():
                    legislator.add_contact_detail(
                        type='address', value=capitol_address, note='Capitol Address')
                if capitol_contact_info[1]:
                    legislator.add_contact_detail(
                        type='voice', value=capitol_contact_info[1], note='Capitol Address')
                if capitol_contact_info[2]:
                    legislator.add_contact_detail(
                        type='fax', value=capitol_contact_info[2], note='Capitol Address')
                if capitol_contact_info[0]:
                    legislator.add_contact_detail(
                        type='email', value=capitol_contact_info[0], note='Capitol Address')

            district_address = self.clean_list([
                member_info['DistrictAddress'][x] for x in [
                    'Street', 'City', 'State', 'Zip'
                ]
            ])

            district_contact_info = self.clean_list([
                member_info['DistrictAddress'][x] for x in [
                    'Email', 'Phone', 'Fax'
                ]
            ])

            # Same cryptic-email issue as the capitol email: drop values
            # that do not contain an @ character.
            if district_contact_info[0] and '@' not in district_contact_info[0]:
                district_contact_info[0] = None

            district_address = " ".join(
                addr_component for addr_component
                in district_address if addr_component
            ).strip()

            # Gate the district details on the district data itself, so an
            # empty capitol office does not suppress a populated district
            # office.
            if len(district_address) > 2 or not district_contact_info.count(None) == 3:
                # NOTE(review): this looks for the email-style marker in the
                # phone slot -- confirm which field was meant.
                if (district_contact_info[1] and
                        '*****@*****.**' in district_contact_info[1]):
                    self.warning("XXX: GA SITE WAS HACKED.")
                    district_contact_info[1] = None

                if district_address.strip():
                    legislator.add_contact_detail(
                        type='address', value=district_address, note='District Address')
                if district_contact_info[1]:
                    legislator.add_contact_detail(
                        type='voice', value=district_contact_info[1], note='District Address')
                if district_contact_info[2]:
                    legislator.add_contact_detail(
                        type='fax', value=district_contact_info[2], note='District Address')
                if district_contact_info[0]:
                    legislator.add_contact_detail(
                        type='email', value=district_contact_info[0], note='District Address')

            legislator.add_link(url)
            legislator.add_source(self.ssource)
            legislator.add_source(HOMEPAGE_URLS[chamber].format(
                **{"code": guid, "sid": sid}))

            yield legislator
Пример #10
0
    def _parse_person(self, row, chamber, seat_map):
        """Build a ``Person`` from one roster row.

        :param row: mapping of roster column names to values.
        :param chamber: ``'lower'`` or ``'upper'``; selects the district
            format and which profile URL template is used.
        :param seat_map: mapping from the row's ``seatno`` value to the seat
            number substituted into ``self.house_profile_url``.
        :returns: the populated ``Person``, or ``None`` (after logging a
            warning) when the district is ``'0'``.
        """
        # Capture legislator vitals.
        first_name = row['FirstName']
        middle_name = row['MiddleName']
        last_name = row['LastName']
        full_name = '{} {} {}'.format(first_name, middle_name, last_name)
        # Collapse the gap left by an empty middle name and trim the edges so
        # an empty first or last name leaves no stray whitespace.
        full_name = re.sub(r'[\s]{2,}', ' ', full_name).strip()

        if chamber == 'lower':
            district = '{} {}'.format(row['County'],
                                      int(row['District'])).strip()
        else:
            district = str(int(row['District'])).strip()

        party = self.party_map[row['party'].upper()]
        email = row['WorkEmail']

        if district == '0':
            self.warning('Skipping {}, district is set to 0'.format(full_name))
            return None

        # Temporary fix for Kari Lerner
        if district == 'Rockingham 0' and last_name == 'Lerner':
            district = 'Rockingham 4'

        person = Person(primary_org=chamber,
                        district=district,
                        name=full_name,
                        party=party)

        person.extras = {
            'first_name': first_name,
            'middle_name': middle_name,
            'last_name': last_name
        }
        if email:
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='District Office')

        # Capture legislator office contact information.
        def _field(key):
            """Return ``row[key]`` as stripped text ('' for ``None``)."""
            value = row[key]
            return '' if value is None else str(value).strip()

        # Assemble the address from the parts that are actually present. A
        # fixed template would leave a blank line for an empty ``address2``
        # and reduce an all-empty row to a lone "," that still gets recorded.
        state_zip = ' '.join(
            part for part in (_field('State'), _field('Zipcode')) if part)
        locality = ', '.join(
            part for part in (_field('city'), state_zip) if part)
        district_address = '\n'.join(
            part for part
            in (_field('Address'), _field('address2'), locality) if part)

        phone = row['Phone'].strip()

        if district_address:
            person.add_contact_detail(type='address',
                                      value=district_address,
                                      note='Home Office')
        if phone:
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Home Office')

        # Retrieve legislator portrait.
        profile_url = None
        if chamber == 'upper':
            profile_url = self.senate_profile_url.format(row['District'])
        elif chamber == 'lower':
            try:
                seat_number = seat_map[row['seatno']]
                profile_url = self.house_profile_url.format(seat_number)
            except KeyError:
                # No seat mapping for this row: leave the portrait unset.
                pass

        if profile_url:
            person.image = self._get_photo(profile_url, chamber)
            person.add_source(profile_url)

        return person
Пример #11
0
    def _parse_person(self, row, chamber, seat_map):
        """Build a ``Person`` from one roster row.

        Contact details are labelled "Capitol Office" or "District Office":
        email by its domain, the address by chamber, and the phone number by
        whether it contains ``271-``.

        :param row: mapping of roster column names to values.
        :param chamber: ``"lower"`` or ``"upper"``; selects the district
            format and which profile URL template is used.
        :param seat_map: mapping from the row's ``seatno`` value to the seat
            number substituted into ``self.house_profile_url``.
        :returns: the populated ``Person``, or ``None`` (after logging a
            warning) when the district is ``"0"``.
        """
        # Capture legislator vitals.
        first_name = row["FirstName"]
        middle_name = row["MiddleName"]
        last_name = row["LastName"]
        full_name = "{} {} {}".format(first_name, middle_name, last_name)
        # Collapse the gap left by an empty middle name and trim the edges so
        # an empty first or last name leaves no stray whitespace.
        full_name = re.sub(r"[\s]{2,}", " ", full_name).strip()

        if chamber == "lower":
            district = "{} {}".format(row["County"],
                                      int(row["District"])).strip()
        else:
            district = str(int(row["District"])).strip()

        party = self.party_map[row["party"].upper()]
        email = row["WorkEmail"]

        if district == "0":
            self.warning("Skipping {}, district is set to 0".format(full_name))
            return None

        person = Person(primary_org=chamber,
                        district=district,
                        name=full_name,
                        party=party)

        person.extras = {
            "first_name": first_name,
            "middle_name": middle_name,
            "last_name": last_name,
        }
        if email:
            office = "Capitol" if email.endswith(
                "@leg.state.nh.us") else "District"
            person.add_contact_detail(type="email",
                                      value=email,
                                      note=office + " Office")

        # Capture legislator office contact information.
        def _field(key):
            """Return ``row[key]`` as stripped text ("" for ``None``)."""
            value = row[key]
            return "" if value is None else str(value).strip()

        # Assemble the address from the parts that are actually present. A
        # fixed template would leave a blank line for an empty ``address2``
        # and reduce an all-empty row to a lone "," that still gets recorded.
        state_zip = " ".join(
            part for part in (_field("State"), _field("Zipcode")) if part)
        locality = ", ".join(
            part for part in (_field("city"), state_zip) if part)
        district_address = "\n".join(
            part for part
            in (_field("Address"), _field("address2"), locality) if part)

        phone = row["Phone"].strip()

        if district_address:
            office = "Capitol" if chamber == "upper" else "District"
            person.add_contact_detail(type="address",
                                      value=district_address,
                                      note=office + " Office")
        if phone:
            office = "Capitol" if "271-" in phone else "District"
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note=office + " Office")

        # Retrieve legislator portrait.
        profile_url = None
        if chamber == "upper":
            profile_url = self.senate_profile_url.format(row["District"])
        elif chamber == "lower":
            try:
                seat_number = seat_map[row["seatno"]]
                profile_url = self.house_profile_url.format(seat_number)
            except KeyError:
                # No seat mapping for this row: leave the portrait unset.
                pass

        if profile_url:
            person.image = self._get_photo(profile_url, chamber)
            person.add_source(profile_url)

        return person
Пример #12
0
    def scrape_session(self, session, chambers):
        """Yield a ``Person`` for each sitting member of ``session``.

        The member list comes from ``self.sservice.GetMembersBySession`` and
        each member is then fetched individually with ``GetMember``. Members
        whose service record for the session has a ``DateVacated`` value are
        skipped.

        :param session: key used to look up the site id in
            ``SESSION_SITE_IDS``.
        :param chambers: accepted but not used by this method.
        :raises Exception: when a member has no service record matching the
            requested session.
        """
        sid = SESSION_SITE_IDS[session]
        members = backoff(self.sservice.GetMembersBySession,
                          sid)['MemberListing']

        for member in members:
            guid = member['Id']
            member_info = backoff(self.sservice.GetMember, guid)

            # Find the member's service record for this session. next()
            # reports "no match" with StopIteration (not IndexError); left
            # uncaught inside a generator it would turn into an opaque
            # RuntimeError, so translate it into the intended error here.
            try:
                legislative_service = next(
                    service for service in member_info['SessionsInService']
                    ['LegislativeService'] if service['Session']['Id'] == sid)
            except (StopIteration, IndexError):
                raise Exception("Something very bad is going on with the "
                                "Legislative service")

            # Check to see if the member has vacated; skip if so:
            if legislative_service['DateVacated']:
                continue

            nick_name, first_name, middle_name, last_name = (
                member_info['Name'][x]
                for x in ['Nickname', 'First', 'Middle', 'Last'])

            # A nickname, when present, replaces the formal first name.
            first_name = nick_name if nick_name else first_name

            if middle_name:
                full_name = "%s %s %s" % (first_name, middle_name, last_name)
            else:
                full_name = "%s %s" % (first_name, last_name)

            party = legislative_service['Party']

            if party == 'Democrat':
                party = 'Democratic'

            elif party.strip() == '':
                party = 'other'

            chamber, district = (legislative_service['District'][x]
                                 for x in ['Type', 'Number'])

            chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

            url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
                "code": guid,
                "sid": sid
            })

            legislator = Person(
                name=full_name,
                district=str(district),
                party=party,
                primary_org=chamber,
                image=photo,
            )
            legislator.extras = {
                'last_name': last_name,
                'first_name': first_name,
                'guid': guid,
            }

            capitol_address = self.clean_list([
                member_info['Address'][x]
                for x in ['Street', 'City', 'State', 'Zip']
            ])

            capitol_address = " ".join(addr_component
                                       for addr_component in capitol_address
                                       if addr_component).strip()

            # Order matters below: [0] is email, [1] is phone, [2] is fax.
            capitol_contact_info = self.clean_list(
                [member_info['Address'][x] for x in ['Email', 'Phone', 'Fax']])

            # Sometimes email is set to a long cryptic string.
            # If it doesn't have a @ character, simply set it to None
            # examples:
            # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
            # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=

            if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
                capitol_contact_info[0] = None

            # if we have more than 2 chars (eg state)
            # or a phone/fax/email address record the info
            if len(capitol_address) > 2 or not capitol_contact_info.count(
                    None) == 3:
                # NOTE(review): the marker is matched in the email slot but
                # the phone slot is the one cleared -- confirm which field
                # was meant.
                if capitol_contact_info[
                        0] and '*****@*****.**' in capitol_contact_info[
                            0]:
                    self.warning("XXX: GA SITE WAS HACKED.")
                    capitol_contact_info[1] = None

                if capitol_address.strip():
                    legislator.add_contact_detail(type='address',
                                                  value=capitol_address,
                                                  note='Capitol Address')
                if capitol_contact_info[1]:
                    legislator.add_contact_detail(
                        type='voice',
                        value=capitol_contact_info[1],
                        note='Capitol Address')
                if capitol_contact_info[2]:
                    legislator.add_contact_detail(
                        type='fax',
                        value=capitol_contact_info[2],
                        note='Capitol Address')
                if capitol_contact_info[0]:
                    legislator.add_contact_detail(
                        type='email',
                        value=capitol_contact_info[0],
                        note='Capitol Address')

            district_address = self.clean_list([
                member_info['DistrictAddress'][x]
                for x in ['Street', 'City', 'State', 'Zip']
            ])

            district_contact_info = self.clean_list([
                member_info['DistrictAddress'][x]
                for x in ['Email', 'Phone', 'Fax']
            ])

            # Same cryptic-email issue as the capitol email: drop values
            # that do not contain an @ character.
            if district_contact_info[0] and '@' not in district_contact_info[0]:
                district_contact_info[0] = None

            district_address = " ".join(addr_component
                                        for addr_component in district_address
                                        if addr_component).strip()

            # Gate the district details on the district data itself, so an
            # empty capitol office does not suppress a populated district
            # office.
            if len(district_address) > 2 or not district_contact_info.count(
                    None) == 3:
                # NOTE(review): this looks for the email-style marker in the
                # phone slot -- confirm which field was meant.
                if (district_contact_info[1] and '*****@*****.**'
                        in district_contact_info[1]):
                    self.warning("XXX: GA SITE WAS HACKED.")
                    district_contact_info[1] = None

                if district_address.strip():
                    legislator.add_contact_detail(type='address',
                                                  value=district_address,
                                                  note='District Address')
                if district_contact_info[1]:
                    legislator.add_contact_detail(
                        type='voice',
                        value=district_contact_info[1],
                        note='District Address')
                if district_contact_info[2]:
                    legislator.add_contact_detail(
                        type='fax',
                        value=district_contact_info[2],
                        note='District Address')
                if district_contact_info[0]:
                    legislator.add_contact_detail(
                        type='email',
                        value=district_contact_info[0],
                        note='District Address')

            legislator.add_link(url)
            legislator.add_source(self.ssource)
            legislator.add_source(HOMEPAGE_URLS[chamber].format(**{
                "code": guid,
                "sid": sid
            }))

            yield legislator
Пример #13
0
    def transform_parse(self, parsed_form, response):

        _source = {
            "url": response.url,
            "note": "LDA Form LD-1"
        }

        # basic disclosure fields
        _disclosure = Disclosure(
            effective_date=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            timezone='America/New_York',
            submitted_date=datetime.strptime(
                parsed_form['datetimes']['signature_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification="lobbying"
        )

        _disclosure.add_authority(name=self.authority.name,
                                  type=self.authority._type,
                                  id=self.authority._id)

        _disclosure.add_identifier(
            identifier=parsed_form['_meta']['document_id'],
            scheme="urn:sopr:filing"
        )

        # disclosure extras
        _disclosure.extras = {}
        _disclosure.extras['registrant'] = {
            'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
            'general_description': parsed_form['registrant']['registrant_general_description'],
            'signature': {
                "signature_date": parsed_form['datetimes']['signature_date'],
                "signature": parsed_form['signature']
            }
        }

        _disclosure.extras['client'] = {
            'same_as_registrant':
                parsed_form['client']['client_self'],
            'general_description':
                parsed_form['client']['client_general_description']
        }

        _disclosure.extras['registration_type'] = {
            'is_amendment':
                parsed_form['registration_type']['is_amendment'],
            'new_registrant':
                parsed_form['registration_type']['new_registrant'],
            'new_client_for_existing_registrant':
                parsed_form['registration_type'][
                    'new_client_for_existing_registrant'],
        }

        # # Registrant
        # build registrant
        _registrant_self_employment = None

        if parsed_form['registrant']['self_employed_individual']:
            n = ' '.join([p for p in [
                parsed_form['registrant']['registrant_individual_prefix'],
                parsed_form['registrant']['registrant_individual_firstname'],
                parsed_form['registrant']['registrant_individual_lastname']
            ] if len(p) > 0]).strip()

            _registrant = Person(
                name=n,
                source_identified=True
            )

            _registrant_self_employment = Organization(
                name='SELF-EMPLOYMENT of {n}'.format(n=n),
                classification='company',
                source_identified=True
            )

            _registrant.add_membership(
                organization=_registrant_self_employment,
                role='self_employed',
                label='self-employment of {n}'.format(n=n),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant = Organization(
                name=parsed_form['registrant']['registrant_org_name'],
                classification='company',
                source_identified=True
            )

        if len(parsed_form['registrant']['registrant_house_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_house_id'],
                scheme='urn:house_clerk:registrant'
            )

        if len(parsed_form['registrant']['registrant_senate_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_senate_id'],
                scheme='urn:sopr:registrant'
            )

        registrant_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['registrant']['registrant_address_one'],
                        parsed_form['registrant']['registrant_address_two'],
                        parsed_form['registrant']['registrant_city'],
                        parsed_form['registrant']['registrant_state'],
                        parsed_form['registrant']['registrant_zip'],
                        parsed_form['registrant']['registrant_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            },
        ]

        registrant_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_ppb_city'],
                    parsed_form['registrant']['registrant_ppb_state'],
                    parsed_form['registrant']['registrant_ppb_zip'],
                    parsed_form['registrant']['registrant_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if registrant_contact_ppb["value"]:
            registrant_contact_details.append(registrant_contact_ppb)

        for cd in registrant_contact_details:
            _registrant.add_contact_detail(**cd)

        _registrant.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address_one",
                            "value": parsed_form['registrant'][
                                'registrant_address_one'],
                        },
                        {
                            "note": "address_two",
                            "value": parsed_form['registrant'][
                                'registrant_address_two'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_country'],
                        }
                    ],
                },
            ]
        }

        # # People
        # build contact
        _main_contact = Person(
            name=parsed_form['registrant']['registrant_contact_name'],
            source_identified=True
        )

        main_contact_contact_details = [
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            }
        ]

        for cd in main_contact_contact_details:
            _main_contact.add_contact_detail(**cd)

        if _registrant._type == 'organization':
            _registrant.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant_self_employment.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

        # # Client
        # build client
        _client = Organization(
            name=parsed_form['client']['client_name'],
            classification='company',
            source_identified=True
        )

        client_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['client']['client_address'],
                        parsed_form['client']['client_city'],
                        parsed_form['client']['client_state'],
                        parsed_form['client']['client_zip'],
                        parsed_form['client']['client_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        client_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_ppb_city'],
                    parsed_form['client']['client_ppb_state'],
                    parsed_form['client']['client_ppb_zip'],
                    parsed_form['client']['client_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if client_contact_ppb["value"]:
            client_contact_details.append(client_contact_ppb)

        for cd in client_contact_details:
            _client.add_contact_detail(**cd)

        _client.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": parsed_form['client']['client_address'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client']['client_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client'][
                                'client_ppb_country'],
                        }
                    ],
                },
            ],
        }

        # Collect Foreign Entities
        _foreign_entities = []
        _foreign_entities_by_name = {}
        for fe in parsed_form['foreign_entities']:
            fe_extras = {}
            fe_name = fe['foreign_entity_name']

            # check for name-based duplicates
            if fe_name in _foreign_entities_by_name:
                _foreign_entity = _foreign_entities_by_name[fe_name]
            else:
                _foreign_entity = Organization(
                    name=fe_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            foreign_entity_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_address'],
                            fe['foreign_entity_city'],
                            fe['foreign_entity_state'],
                            fe['foreign_entity_country']]
                        if len(p) > 0]).strip(),
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_ppb_state'],
                            fe['foreign_entity_ppb_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            foreign_entity_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_city'],
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]),
            }

            if foreign_entity_contact_ppb["value"]:
                foreign_entity_contact_details.append(
                    foreign_entity_contact_ppb)

            # add contact details
            for cd in foreign_entity_contact_details:
                if cd['value'] != '':
                    _foreign_entity.add_contact_detail(**cd)

            # add extras
            fe_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": fe['foreign_entity_address'],
                        },
                        {
                            "note": "city",
                            "value": fe['foreign_entity_city'],
                        },
                        {
                            "note": "state",
                            "value": fe['foreign_entity_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "state",
                            "value": fe['foreign_entity_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_ppb_country'],
                        }
                    ],
                },
            ]

            _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                                   fe_extras)

            _foreign_entities_by_name[fe_name] = _foreign_entity

        for unique_foreign_entity in _foreign_entities_by_name.values():
            _foreign_entities.append(unique_foreign_entity)

            # TODO: add a variant on memberships to represent inter-org
            # relationships (associations, ownership, etc)
            #
            # _client['memberships'].append({
            #     "id": _foreign_entity['id'],
            #     "classification": "organization",
            #     "name": _foreign_entity['name'],
            #     "extras": {
            #         "ownership_percentage":
            #             fe['foreign_entity_amount']
            #     }
            # })

        # Collect Lobbyists
        # TODO: deal with weird non-name line continuation cases (blanks, "continued")
        _lobbyists_by_name = {}

        for l in parsed_form['lobbyists']:
            l_extras = {}
            l_name = ' '.join([l['lobbyist_first_name'],
                               l['lobbyist_last_name'],
                               l['lobbyist_suffix']
                               ]).strip()

            if l_name in _lobbyists_by_name:
                _lobbyist = _lobbyists_by_name[l_name]
            else:
                _lobbyist = Person(
                    name=l_name,
                    source_identified=True
                )

            if l['lobbyist_covered_official_position']:
                l_extras['lda_covered_official_positions'] = [
                    {
                        'date_reported':
                            parsed_form['datetimes']['effective_date'],
                        'covered_official_position':
                            l['lobbyist_covered_official_position']
                    },
                ]

            _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)

            _lobbyists_by_name[l_name] = _lobbyist

        _lobbyists = []
        for unique_lobbyist in _lobbyists_by_name.values():
            _lobbyists.append(unique_lobbyist)

        if _registrant._type == 'organization':
            for l in _lobbyists:
                _registrant.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )
        else:
            for l in _lobbyists:
                _registrant_self_employment.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )

        # # Document
        # build document
        _disclosure.add_document(
            note='submitted filing',
            date=parsed_form['datetimes']['effective_date'][:10],
            url=response.url
        )

        # Collect Affiliated orgs
        _affiliated_organizations = []
        _affiliated_organizations_by_name = {}
        for ao in parsed_form['affiliated_organizations']:
            ao_extras = {}
            ao_name = ao['affiliated_organization_name']
            if ao_name in _affiliated_organizations_by_name:
                # There's already one by this name
                _affiliated_organization = _affiliated_organizations_by_name[ao_name]
            else:
                # New affiliated org
                _affiliated_organization = Organization(
                    name=ao_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            affiliated_organization_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            ao['affiliated_organization_address'],
                            ao['affiliated_organization_city'],
                            ao['affiliated_organization_state'],
                            ao['affiliated_organization_zip'],
                            ao['affiliated_organization_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            affiliated_organization_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_ppb_city'],
                        ao['affiliated_organization_ppb_state'],
                        ao['affiliated_organization_ppb_country']]
                    if len(p) > 0]).strip(),
            }

            if affiliated_organization_contact_ppb["value"]:
                affiliated_organization_contact_details.append(
                    affiliated_organization_contact_ppb)

            # add contact details
            for cd in affiliated_organization_contact_details:
                _affiliated_organization.add_contact_detail(**cd)

            ao_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": ao['affiliated_organization_address'],
                        },
                        {
                            "note": "city",
                            "value": ao['affiliated_organization_city'],
                        },
                        {
                            "note": "state",
                            "value": ao['affiliated_organization_state'],
                        },
                        {
                            "note": "zip",
                            "value": ao['affiliated_organization_zip'],
                        },
                        {
                            "note": "country",
                            "value": ao['affiliated_organization_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value":
                                ao['affiliated_organization_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value":
                                ao['affiliated_organization_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value":
                                ao['affiliated_organization_ppb_country'],
                        }
                    ],
                },
            ],

            _affiliated_organization.extras = combine_dicts(
                _affiliated_organization.extras, ao_extras)

        for unique_affiliated_organization in _affiliated_organizations_by_name.values():
            _affiliated_organizations.append(unique_affiliated_organization)

        # # Events & Agendas
        # name
        if parsed_form['registration_type']['new_registrant']:
            registration_type = 'New Client, New Registrant'
        elif parsed_form['registration_type']['is_amendment']:
            registration_type = 'Amended Registration'
        else:
            registration_type = 'New Client for Existing Registrant'

        # Create registration event
        _event = Event(
            name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                            rt=registration_type,
                                            cn=_client.name),
            timezone='America/New_York',
            location='United States',
            start_time=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification='registration'
        )

        # add participants
        _event.add_participant(type=_registrant._type,
                               id=_registrant._id,
                               name=_registrant.name,
                               note="registrant")

        if _registrant._type == 'person':
            _event.add_participant(type=_registrant._type,
                                   id=_registrant._id,
                                   name=_registrant.name,
                                   note="registrant")

        _event.add_participant(type=_client._type,
                               id=_client._id,
                               name=_client.name,
                               note="client")

        for l in _lobbyists:
            _event.add_participant(type=l._type,
                                   id=l._id,
                                   name=l.name,
                                   note='lobbyist')

        for fe in _foreign_entities:
            _event.add_participant(type=fe._type,
                                   id=fe._id,
                                   name=fe.name,
                                   note='foreign_entity')

        for ao in _affiliated_organizations:
            _event.add_participant(type=ao._type,
                                   id=ao._id,
                                   name=ao.name,
                                   note='affiliated_organization')

        # add agenda item
        _agenda = _event.add_agenda_item(
            description='issues lobbied on',
        )

        _agenda['notes'].append(
            parsed_form['lobbying_issues_detail']
        )

        for li in parsed_form['lobbying_issues']:
            if li['general_issue_area'] != '':
                _agenda.add_subject(li['general_issue_area'])

        _disclosure.add_disclosed_event(
            name=_event.name,
            type=_event._type,
            classification=_event.classification,
            id=_event._id
        )

        # add registrant to disclosure's _related and related_entities fields
        _disclosure.add_registrant(name=_registrant.name,
                                   type=_registrant._type,
                                   id=_registrant._id)

        _registrant.add_source(
            url=_source['url'],
            note='registrant'
        )
        yield _registrant

        if _registrant_self_employment is not None:
            _registrant_self_employment.add_source(
                url=_source['url'],
                note='registrant_self_employment'
            )

            yield _registrant_self_employment

        _client.add_source(
            url=_source['url'],
            note='client'
        )
        yield _client

        _main_contact.add_source(
            url=_source['url'],
            note='main_contact'
        )
        yield _main_contact

        for ao in _affiliated_organizations:
            ao.add_source(
                url=_source['url'],
                note='affiliated_organization'
            )
            yield ao
        for fe in _foreign_entities:
            fe.add_source(
                url=_source['url'],
                note='foreign_entity'
            )
            yield fe
        for l in _lobbyists:
            l.add_source(
                url=_source['url'],
                note='lobbyist'
            )
            yield l

        _event.add_source(**_source)
        yield _event
        _disclosure.add_source(**_source)
        yield _disclosure
Пример #14
0
    def _scrape_lower_chamber(self):
        """Scrape the lower-chamber roster and yield a ``Person`` per member.

        Fetches ``self._reps_url`` and walks the rows of the ``theTable``
        table.  For every seated member the per-district details page
        (``self._rep_details_url``) is fetched as well, to pick up the
        e-mail address and photo.

        Rows whose last-name cell is ``"Vacant"`` are not yielded; they are
        handed to ``self._save_vacant_legislator`` instead.

        Yields:
            Person: one object per seated member, carrying its sources, a
            link to the details page and "Capitol Office" contact details.
        """
        self.info("Scraping lower chamber for legislators.")

        chamber = "lower"

        # Roster abbreviations mapped to the party names stored on Person.
        party_names = {"D": "Democratic", "R": "Republican"}

        # Fallback parties for members whose roster row has an empty party
        # cell.  Keys are compared against ``full_name``; the leading space
        # means they only match rows whose first-name cell is empty.
        party_override = {
            " Green": "Democratic",
            " Sisco": "Republican",
        }

        roster_url = self._reps_url
        page = lxml.html.fromstring(self.get(roster_url).text)
        # This is the ASP.net table container
        table = page.xpath("//table[@id='theTable']")[0]
        # The first three rows are skipped (presumably header rows -- this
        # cannot be confirmed from the scraper code alone).
        for tr in table.xpath("tr")[3:]:
            # If a given term hasn't occurred yet, then ignore it
            # Eg, in 2017, the 2018 term page will have a blank table
            if tr.attrib.get("class") == "dxgvEmptyDataRow":
                self.warning("No House members found")
                return

            tds = tr.xpath("td")
            last_name = tds[1].text_content().strip()
            first_name = tds[2].text_content().strip()
            full_name = "{} {}".format(first_name, last_name)
            # The int() round-trip normalises the district number text.
            district = str(int(tds[3].text_content().strip()))
            party = tds[4].text_content().strip()
            party = party_names.get(party, party)

            phone = tds[6].text_content().strip()
            room = tds[7].text_content().strip()
            address = self._assumed_address_fmt.format(room)

            is_vacant = last_name == "Vacant"

            # The per-member override must run BEFORE the generic "Other"
            # fallback.  Previously the fallback ran first, so the party was
            # never blank by the time the override was consulted and the
            # override could not match.
            if not party and not is_vacant:
                party = party_override.get(full_name, "")
            if not party:  # Workaround for now.
                party = "Other"

            person = Person(name=full_name,
                            primary_org=chamber,
                            district=district,
                            party=party)
            person.extras = {
                "first_name": first_name,
                "last_name": last_name
            }

            if is_vacant:
                person.add_contact_detail(type="address",
                                          value=address,
                                          note="Capitol Office")
                if phone:
                    person.add_contact_detail(type="voice",
                                              value=phone,
                                              note="Capitol Office")

                person.add_source(roster_url)

                self._save_vacant_legislator(person)
                continue

            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)

            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)

            mailto = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"] '
                '//a[starts-with(@href,"mailto:")]/@href')
            # A bare "mailto:" href carries no address, so treat it as
            # missing.
            if mailto and mailto[0].lower() != "mailto:":
                email = mailto[0].split(":")[1]
            else:
                email = None

            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
            if phone:
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")
            if email:
                person.add_contact_detail(type="email",
                                          value=email,
                                          note="Capitol Office")

            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if picture:
                person.image = picture[0]

            yield person
Пример #15
0
    def scrape_session(self, session, chambers):
        """Yield a ``Person`` for each active member of ``session``.

        The member listing and every member record are fetched through
        ``backoff`` from ``self.sservice``.  Records whose id was already
        seen are skipped, as are members without exactly one non-vacated
        service entry for the session.

        Args:
            session: key into ``SESSION_SITE_IDS`` selecting the session.
            chambers: accepted for interface compatibility; not used by
                this method.

        Yields:
            Person: the legislator with capitol and district contact
            details, a homepage link and sources attached.
        """
        sid = SESSION_SITE_IDS[session]
        listing = backoff(self.sservice.GetMembersBySession, sid)
        members = listing["MemberListing"]

        # One entry per office: (key in member_info, contact-detail note,
        # warning emitted when the street address is missing).
        offices = (
            ("Address", "Capitol Address",
             "Could not find full capitol address for {}"),
            ("DistrictAddress", "District Address",
             "Could not find full district address for {}"),
        )
        address_fields = ["Street", "City", "State", "Zip"]
        # Contact details are added in this order; the index refers to the
        # position in the [Email, Phone, Fax] list built below.
        contact_order = (("voice", 1), ("fax", 2), ("email", 0))

        seen_guids = []
        for member in members:
            guid = member["Id"]
            member_info = backoff(self.sservice.GetMember, guid)

            # If a member switches chambers during the session, they may
            # appear twice. Skip the duplicate record accordingly.
            if guid in seen_guids:
                self.warning("Skipping duplicate record of {}".format(
                    member_info["Name"]["Last"]))
                continue
            seen_guids.append(guid)

            # Check to see if the member has vacated; skip if so.
            # A member can have multiple services for a given session,
            # if they switched chambers. Filter these down to just the
            # active service; anything other than exactly one match makes
            # the unpacking raise ValueError.
            try:
                (legislative_service, ) = [
                    svc for svc in
                    member_info["SessionsInService"]["LegislativeService"]
                    if svc["Session"]["Id"] == sid
                    and svc["DateVacated"] is None
                ]
            except ValueError:
                self.info("Skipping retired member {}".format(
                    member_info["Name"]["Last"]))
                continue

            name_info = member_info["Name"]
            nick_name = name_info["Nickname"]
            first_name = name_info["First"]
            middle_name = name_info["Middle"]
            last_name = name_info["Last"]

            # A nickname, when present, replaces the first name.
            if nick_name:
                first_name = nick_name

            if middle_name:
                full_name = "%s %s %s" % (first_name, middle_name, last_name)
            else:
                full_name = "%s %s" % (first_name, last_name)

            party = legislative_service["Party"]
            if party == "Democrat":
                party = "Democratic"
            elif not party.strip():
                party = "other"

            district_info = legislative_service["District"]
            chamber_type = district_info["Type"]
            district = district_info["Number"]
            chamber = {"House": "lower", "Senate": "upper"}[chamber_type]

            url, photo = self.scrape_homepage(
                HOMEPAGE_URLS[chamber], {"code": guid, "sid": sid})

            legislator = Person(
                name=full_name,
                district=str(district),
                party=party,
                primary_org=chamber,
                image=photo,
            )
            legislator.extras = {
                "family_name": last_name,
                "given_name": first_name,
                "guid": guid,
            }

            for office_key, note, missing_msg in offices:
                office = member_info[office_key]

                street = office["Street"]
                if street is not None and street.strip():
                    parts = {
                        field: value.strip()
                        for field, value in dict(office).items()
                        if field in address_fields
                    }
                    legislator.add_contact_detail(
                        type="address",
                        value="{Street}\n{City}, {State} {Zip}".format(
                            **parts),
                        note=note)
                else:
                    self.warning(missing_msg.format(full_name))

                contact = self.clean_list(
                    [office[x] for x in ["Email", "Phone", "Fax"]])

                # Sometimes email is set to a long cryptic string.
                # If it doesn't have a @ character, simply set it to None
                # examples:
                # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
                # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
                if contact[0] and "@" not in contact[0]:
                    contact[0] = None

                if contact[0]:
                    # Site was hacked in the past
                    assert "*****@*****.**" not in contact[0]

                for detail_type, index in contact_order:
                    if contact[index]:
                        legislator.add_contact_detail(type=detail_type,
                                                      value=contact[index],
                                                      note=note)

            legislator.add_link(url)
            legislator.add_source(self.ssource)
            legislator.add_source(
                HOMEPAGE_URLS[chamber].format(code=guid, sid=sid))

            yield legislator
Пример #16
0
    def _scrape_lower_chamber(self):
        """Scrape the lower-chamber roster and yield a ``Person`` per member.

        Fetches ``self._reps_url`` and walks the rows of the
        ``ContentPlaceHolder1_gridMembers_DXMainTable`` table.  For every
        seated member the per-district details page
        (``self._rep_details_url``) is fetched as well, to pick up the
        e-mail address and photo.

        Rows whose last-name cell is ``'Vacant'`` are not yielded; they are
        handed to ``self._save_vacant_legislator`` instead.

        Yields:
            Person: one object per seated member, carrying its sources, a
            link to the details page and 'Capitol Office' contact details.
        """
        self.info('Scraping lower chamber for legislators.')

        chamber = 'lower'

        # Fallback parties for members whose roster row has an empty party
        # cell.  Keys are compared against ``full_name``; the leading space
        # means they only match rows whose first-name cell is empty.
        party_override = {
            " Green": "Democratic",
            " Sisco": "Republican",
        }

        roster_url = self._reps_url
        page = lxml.html.fromstring(self.get(roster_url).text)
        # This is the ASP.net table container
        table = page.xpath('id("ContentPlaceHolder1_gridMembers_DXMainTable")')[0]
        # The first row is skipped (presumably the header row -- this cannot
        # be confirmed from the scraper code alone).
        for tr in table.xpath('tr')[1:]:
            # If a given term hasn't occurred yet, then ignore it
            # Eg, in 2017, the 2018 term page will have a blank table
            if tr.attrib.get('class') == 'dxgvEmptyDataRow':
                self.warning('No House members found')
                return

            tds = tr.xpath('td')
            last_name = tds[0].text_content().strip()
            first_name = tds[1].text_content().strip()
            full_name = '{} {}'.format(first_name, last_name)
            # The int() round-trip normalises the district number text.
            district = str(int(tds[2].text_content().strip()))
            party = tds[3].text_content().strip()
            if party == 'Democrat':
                party = 'Democratic'

            phone = tds[4].text_content().strip()
            room = tds[5].text_content().strip()
            address = self._assumed_address_fmt.format(room)

            is_vacant = last_name == 'Vacant'

            # The per-member override must run BEFORE the generic "Other"
            # fallback.  Previously the fallback ran first, so the party was
            # never blank by the time the override was consulted and the
            # override could not match.
            if not party and not is_vacant:
                party = party_override.get(full_name, "")
            if not party:  # Workaround for now.
                party = "Other"

            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }

            if is_vacant:
                person.add_contact_detail(type='address', value=address, note='Capitol Office')
                if phone:
                    person.add_contact_detail(type='voice', value=phone, note='Capitol Office')

                person.add_source(roster_url)

                self._save_vacant_legislator(person)
                continue

            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)

            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)

            mailto = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"]'
                '/table/tr[4]/td/a/@href'
            )
            # A bare "mailto:" href carries no address, so treat it as
            # missing.
            if mailto and mailto[0].lower() != 'mailto:':
                email = mailto[0].split(':')[1]
            else:
                email = None

            person.add_contact_detail(type='address', value=address, note='Capitol Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            if email:
                person.add_contact_detail(type='email', value=email, note='Capitol Office')

            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if picture:
                person.image = picture[0]

            yield person
Пример #17
0
    def scrape_session(self, session, chambers):
        """Yield a ``Person`` for each active legislator of ``session``.

        The member listing for the session is fetched through
        ``self.sservice``; every listed member is then looked up
        individually.  A member is skipped when their id was already seen
        in the listing, or when they do not have exactly one non-vacated
        service record for the session.

        Args:
            session: key into ``SESSION_SITE_IDS`` selecting the site's
                session id.
            chambers: accepted for interface compatibility; not read here.

        Yields:
            Person: one per active member, carrying capitol and district
            contact details, a homepage link and source URLs.

        Raises:
            AssertionError: if an e-mail address contains the known
                defacement placeholder.
        """
        sid = SESSION_SITE_IDS[session]
        members = backoff(
            self.sservice.GetMembersBySession,
            sid
        )['MemberListing']

        # (key in member_info, note stored on the contact details,
        #  word used in the "could not find" warning)
        offices = (
            ('Address', 'Capitol Address', 'capitol'),
            ('DistrictAddress', 'District Address', 'district'),
        )

        # Only membership is ever tested, so a set is the right container.
        seen_guids = set()
        for member in members:
            guid = member['Id']
            # Fetched before the duplicate check because the duplicate
            # warning reports the last name from this detail record.
            member_info = backoff(self.sservice.GetMember, guid)

            # If a member switches chambers during the session, they may
            # appear twice. Skip the duplicate record accordingly.
            if guid in seen_guids:
                self.warning('Skipping duplicate record of {}'.format(member_info['Name']['Last']))
                continue
            seen_guids.add(guid)

            # Check to see if the member has vacated; skip if so.
            # A member can have multiple services for a given session,
            # if they switched chambers. Filter these down to just the
            # active service.
            active_services = [
                service for service
                in member_info['SessionsInService']['LegislativeService']
                if service['Session']['Id'] == sid and service['DateVacated'] is None
            ]
            try:
                # Unpacking fails unless exactly one active service remains.
                (legislative_service, ) = active_services
            except ValueError:
                self.info('Skipping retired member {}'.format(member_info['Name']['Last']))
                continue

            name_info = member_info['Name']
            middle_name = name_info['Middle']
            last_name = name_info['Last']
            # A nickname, when present, replaces the first name.
            first_name = name_info['Nickname'] or name_info['First']

            if middle_name:
                full_name = "%s %s %s" % (first_name, middle_name, last_name)
            else:
                full_name = "%s %s" % (first_name, last_name)

            party = legislative_service['Party']
            if party == 'Democrat':
                party = 'Democratic'
            elif party.strip() == '':
                party = 'other'

            district_info = legislative_service['District']
            district = district_info['Number']
            chamber = {
                "House": 'lower',
                "Senate": 'upper'
            }[district_info['Type']]

            url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
                                              {"code": guid, "sid": sid})

            legislator = Person(
                name=full_name,
                district=str(district),
                party=party,
                primary_org=chamber,
                image=photo,
            )
            legislator.extras = {
                'family_name': last_name,
                'given_name': first_name,
                'guid': guid,
            }

            # Capitol and district offices share the same record layout, so
            # both are handled by one loop; details are added in the order
            # address, voice, fax, email for each office.
            for office_key, note, label in offices:
                office = member_info[office_key]
                street = office['Street']

                if street is not None and street.strip():
                    address_parts = {
                        k: v.strip() for k, v
                        in dict(office).items()
                        if k in ['Street', 'City', 'State', 'Zip']
                    }
                    legislator.add_contact_detail(
                        type='address',
                        value='{Street}\n{City}, {State} {Zip}'.format(**address_parts),
                        note=note)
                else:
                    self.warning(
                        'Could not find full {} address for {}'.format(label, full_name))

                contact_info = self.clean_list([
                    office[x] for x in ['Email', 'Phone', 'Fax']
                ])
                email, phone, fax = contact_info[0], contact_info[1], contact_info[2]

                # Sometimes email is set to a long cryptic string.
                # If it doesn't have a @ character, simply set it to None
                # examples:
                # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
                # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
                if email and '@' not in email:
                    email = None

                # Site was hacked in the past.  Raised explicitly (instead
                # of a bare ``assert``) so the check survives ``python -O``.
                if email and '*****@*****.**' in email:
                    raise AssertionError(
                        'Suspicious placeholder e-mail for {}: {!r}'.format(full_name, email))

                if phone:
                    legislator.add_contact_detail(type='voice', value=phone, note=note)
                if fax:
                    legislator.add_contact_detail(type='fax', value=fax, note=note)
                if email:
                    legislator.add_contact_detail(type='email', value=email, note=note)

            legislator.add_link(url)
            legislator.add_source(self.ssource)
            legislator.add_source(HOMEPAGE_URLS[chamber].format(code=guid, sid=sid))

            yield legislator
Пример #18
0
    def scrape(self):
        """Yield council members, then the committees they belong to.

        Rows returned by ``self.councilMembers`` are grouped by the
        person's URL; rows without a URL are ignored.  For each person the
        per-row (start, end, district) spans are merged when they are
        back-to-back (exactly one day apart) in the same district, and one
        term is added per merged span.  Committee organizations are
        created lazily while memberships are attached.

        Yields:
            Person: one per distinct person URL.
            Organization: every collected committee whose name contains
            ``'Committee'``, then every one whose name contains
            ``'Subcommittee'``, followed by two hard-coded organizations.
        """
        noncommittees = {'Committee of the Whole'}
        committee_d = {}

        # person URL -> ([rows for that person], committees of the first row)
        people_d = {}

        # Go to memberlist
        extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

        for councilman, committees in self.councilMembers(
                extra_args=extra_args):

            if 'url' in councilman['Person Name']:
                councilman_url = councilman['Person Name']['url']

                if councilman_url in people_d:
                    people_d[councilman_url][0].append(councilman)
                else:
                    people_d[councilman_url] = [councilman], committees

        for person_entries, committees in people_d.values():

            # The last collected row supplies the person's profile fields.
            councilman = person_entries[-1]

            p = Person(councilman['Person Name']['label'])

            if p.name == 'Letitia James':
                p.name = 'Letitia Ms. James'
                p.add_name('Letitia James')

            spans = [(self.toTime(entry['Start Date']).date(),
                      self.toTime(entry['End Date']).date(), entry['District'])
                     for entry in person_entries]

            # Merge spans that directly follow one another in the same
            # district.  The most recent span lives at merged_spans[-1], so
            # no separate "previous end/district" bookkeeping is required
            # and an empty input simply yields no spans.
            merged_spans = []
            one_day = datetime.timedelta(1)
            for start_date, end_date, district in sorted(spans):
                if merged_spans:
                    current = merged_spans[-1]
                    if (start_date - current[1]) == one_day and district == current[2]:
                        current[1] = end_date
                        continue
                merged_spans.append([start_date, end_date, district])

            # NOTE(review): every term is filed under the district of the
            # last row, not the district recorded on the span itself.  That
            # is the pre-existing behaviour and is kept here; confirm
            # whether per-span districts were intended.
            term_district = councilman['District'].replace(' 0', ' ')
            for start_date, end_date, _ in merged_spans:
                # A term ending on this date is recorded as open-ended.
                if end_date == datetime.date(2017, 12, 31):
                    end_date = ''
                else:
                    end_date = end_date.isoformat()
                p.add_term('Council Member',
                           'legislature',
                           district=term_district,
                           start_date=start_date.isoformat(),
                           end_date=end_date)

            party = councilman['Political Party']
            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if councilman['Photo']:
                p.image = councilman['Photo']

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.extras = {'Notes': councilman['Notes']}

            p.add_source(councilman['Person Name']['url'], note='web')

            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']
                # Only bodies that call themselves a committee count, and
                # the explicitly excluded names never do.
                if (committee_name in noncommittees
                        or 'committee' not in committee_name.lower()):
                    continue

                o = committee_d.get(committee_name)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(
                    committee["Start Date"])
            yield p

        # Two passes: names containing 'Committee' are emitted before names
        # containing 'Subcommittee'.
        for o in committee_d.values():
            if 'Committee' in o.name:
                yield o

        for o in committee_d.values():
            if 'Subcommittee' in o.name:
                yield o

        o = Organization(
            'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
            classification='committee',
            parent_id={'name': 'New York City Council'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o

        o = Organization(
            'Subcommittee on Drug Abuse',
            classification='committee',
            parent_id={
                'name':
                'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'
            })
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o
Пример #19
0
    def scrape_session(self, session, chambers):
        """Yield a ``Person`` for each active legislator of ``session``.

        The session's member listing is retrieved via ``self.sservice`` and
        each member is then fetched individually.  Members whose id has
        already been processed, and members without exactly one
        non-vacated service record for the session, are skipped.

        Args:
            session: key into ``SESSION_SITE_IDS`` selecting the site's
                session id.
            chambers: accepted for interface compatibility; not read here.

        Yields:
            Person: one per active member, with capitol and district
            contact details, a homepage link and source URLs attached.

        Raises:
            AssertionError: if an e-mail address contains the known
                defacement placeholder.
        """
        sid = SESSION_SITE_IDS[session]
        members = backoff(self.sservice.GetMembersBySession,
                          sid)['MemberListing']

        # (key in member_info, note stored on the contact details,
        #  word used in the "could not find" warning)
        offices = (
            ('Address', 'Capitol Address', 'capitol'),
            ('DistrictAddress', 'District Address', 'district'),
        )

        # Used purely for membership tests, hence a set rather than a list.
        seen_guids = set()
        for member in members:
            guid = member['Id']
            # The detail record is needed even for duplicates: the warning
            # below reports the last name taken from it.
            member_info = backoff(self.sservice.GetMember, guid)

            # If a member switches chambers during the session, they may
            # appear twice. Skip the duplicate record accordingly.
            if guid in seen_guids:
                self.warning('Skipping duplicate record of {}'.format(
                    member_info['Name']['Last']))
                continue
            seen_guids.add(guid)

            # Check to see if the member has vacated; skip if so.
            # A member can have multiple services for a given session,
            # if they switched chambers. Filter these down to just the
            # active service.
            active_services = [
                service for service in member_info['SessionsInService']
                ['LegislativeService'] if service['Session']['Id'] == sid
                and service['DateVacated'] is None
            ]
            try:
                # Unpacking fails unless exactly one active service remains.
                (legislative_service, ) = active_services
            except ValueError:
                self.info('Skipping retired member {}'.format(
                    member_info['Name']['Last']))
                continue

            name_info = member_info['Name']
            middle_name = name_info['Middle']
            last_name = name_info['Last']
            # A nickname, when present, replaces the first name.
            first_name = name_info['Nickname'] or name_info['First']

            if middle_name:
                full_name = "%s %s %s" % (first_name, middle_name, last_name)
            else:
                full_name = "%s %s" % (first_name, last_name)

            party = legislative_service['Party']
            if party == 'Democrat':
                party = 'Democratic'
            elif party.strip() == '':
                party = 'other'

            district_info = legislative_service['District']
            district = district_info['Number']
            chamber = {"House": 'lower', "Senate": 'upper'}[district_info['Type']]

            url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
                "code": guid,
                "sid": sid
            })

            legislator = Person(
                name=full_name,
                district=str(district),
                party=party,
                primary_org=chamber,
                image=photo,
            )
            legislator.extras = {
                'last_name': last_name,
                'first_name': first_name,
                'guid': guid,
            }

            # The capitol and district records have the same layout, so one
            # loop covers both; per office the details are added in the
            # order address, voice, fax, email.
            for office_key, note, label in offices:
                office = member_info[office_key]
                street = office['Street']

                if street is not None and street.strip():
                    address_parts = {
                        k: v.strip()
                        for k, v in dict(office).items()
                        if k in ['Street', 'City', 'State', 'Zip']
                    }
                    legislator.add_contact_detail(
                        type='address',
                        value='{Street}\n{City}, {State} {Zip}'.format(
                            **address_parts),
                        note=note)
                else:
                    self.warning(
                        'Could not find full {} address for {}'.format(
                            label, full_name))

                contact_info = self.clean_list(
                    [office[x] for x in ['Email', 'Phone', 'Fax']])
                email, phone, fax = (contact_info[0], contact_info[1],
                                     contact_info[2])

                # Sometimes email is set to a long cryptic string.
                # If it doesn't have a @ character, simply set it to None
                # examples:
                # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
                # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
                if email and '@' not in email:
                    email = None

                # Site was hacked in the past.  An explicit raise keeps this
                # guard active under ``python -O``, unlike a bare ``assert``.
                if email and '*****@*****.**' in email:
                    raise AssertionError(
                        'Suspicious placeholder e-mail for {}: {!r}'.format(
                            full_name, email))

                if phone:
                    legislator.add_contact_detail(type='voice',
                                                  value=phone,
                                                  note=note)
                if fax:
                    legislator.add_contact_detail(type='fax',
                                                  value=fax,
                                                  note=note)
                if email:
                    legislator.add_contact_detail(type='email',
                                                  value=email,
                                                  note=note)

            legislator.add_link(url)
            legislator.add_source(self.ssource)
            legislator.add_source(HOMEPAGE_URLS[chamber].format(code=guid,
                                                                sid=sid))

            yield legislator