def get_member(self, session, chamber, kpid):
    """Yield a Person for one Kansas legislator, identified by KPID.

    Pulls vitals from the ksapi member endpoint, then tries the public
    bio page for a portrait; a missing bio page is logged and tolerated.
    """
    api_url = "%smembers/%s" % (ksapi.url, kpid)
    member = json.loads(self.get(api_url).text)["content"]

    # Normalize the party label to the canonical form.
    party = member["PARTY"]
    party = "Democratic" if party == "Democrat" else party

    # Each biennium has its own URL slug on kslegislature.org.
    session_slugs = {
        "2013-2014": "b2013_14",
        "2015-2016": "b2015_16",
        "2017-2018": "b2017_18",
        "2019-2020": "b2019_20",
    }
    leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (
        session_slugs[session], kpid)

    try:
        bio_page = self.lxmlize(leg_url)
        photo_url, = bio_page.xpath('//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        # No bio page: keep scraping, but drop the link and photo.
        self.warning("{}'s legislator bio page not found".format(
            member["FULLNAME"]))
        leg_url = ""
        photo_url = ""

    person = Person(
        name=member["FULLNAME"],
        district=str(member["DISTRICT"]),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {"occupation": member["OCCUPATION"]}

    note = "Capitol Office"
    address = "\n".join([
        "Room {}".format(member["OFFICENUM"]),
        "Kansas State Capitol Building",
        "300 SW 10th St.",
        "Topeka, KS 66612",
    ])
    person.add_contact_detail(type="address", value=address, note=note)
    person.add_contact_detail(type="email", value=member["EMAIL"], note=note)
    if member["OFFPH"]:
        person.add_contact_detail(type="voice", value=member["OFFPH"], note=note)

    person.add_source(api_url)
    person.add_link(leg_url)
    yield person
def get_member(self, session, chamber, kpid):
    """Yield a Person for one Kansas legislator, identified by KPID.

    Fix: the session-slug map was missing '2019-2020', which both sibling
    variants of this scraper include; without it, scraping that session
    raised KeyError. Added for consistency.
    """
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    # Normalize the party label to the canonical form.
    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    # Each biennium has its own URL slug on kslegislature.org.
    slug = {
        '2013-2014': 'b2013_14',
        '2015-2016': 'b2015_16',
        '2017-2018': 'b2017_18',
        '2019-2020': 'b2019_20',
    }[session]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        # Missing bio page: log it and continue without link/photo.
        self.warning("{}'s legislator bio page not found".format(
            content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])
    note = 'Capitol Office'
    person.add_contact_detail(type='address', value=address, note=note)
    person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'],
                                  note=note)

    person.add_source(url)
    person.add_link(leg_url)
    yield person
def get_member(self, session, chamber, kpid):
    """Yield a Person for one Kansas legislator, identified by KPID.

    Vitals come from the ksapi member endpoint; the public bio page is
    probed for a portrait and tolerated if missing.
    """
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    # Normalize the party label to the canonical form.
    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    # Each biennium has its own URL slug on kslegislature.org.
    slug = {'2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18',
            '2019-2020': 'b2019_20', }[session]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        # Exactly one portrait is expected; tuple-unpack enforces that.
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        # Missing bio page: log and continue with empty link/photo.
        self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    # Capitol mailing address is assembled from the room number only;
    # the rest is the fixed statehouse address.
    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])
    note = 'Capitol Office'
    person.add_contact_detail(type='address', value=address, note=note)
    person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

    person.add_source(url)
    person.add_link(leg_url)
    yield person
def _scrape_lower_chamber(self):
    """Yield Person records for House members from the roster grid.

    Vacant seats are saved via _save_vacant_legislator instead of being
    yielded; all other members get a details-page lookup for email/photo.
    """
    self.info('Scraping lower chamber for legislators.')
    chamber = 'lower'
    roster_url = (self._reps_url)
    page = self.get(roster_url).text
    page = lxml.html.fromstring(page)
    # This is the ASP.net table container
    table_xpath = ('id("ContentPlaceHolder1_'
                   'gridMembers_DXMainTable")')
    table = page.xpath(table_xpath)[0]
    for tr in table.xpath('tr')[1:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if tr.attrib.get('class') == 'dxgvEmptyDataRow':
            self.warning('No House members found')
            return
        # Column layout: last, first, district, party, phone, room.
        tds = tr.xpath('td')
        last_name = tds[0].text_content().strip()
        first_name = tds[1].text_content().strip()
        full_name = '{} {}'.format(first_name, last_name)
        district = str(int(tds[2].text_content().strip()))
        party = tds[3].text_content().strip()
        if party == 'Democrat':
            party = 'Democratic'
        if party.strip() == "":
            # Workaround for now.
            party = "Other"
        phone = tds[4].text_content().strip()
        room = tds[5].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else '')
        if last_name == 'Vacant':
            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
            if phone.strip():
                person.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')
            person.add_source(roster_url)
            # Vacant seats are recorded separately, not yielded.
            self._save_vacant_legislator(person)
        else:
            # NOTE(review): keys start with a space because full_name is
            # '{first} {last}' with an empty first name for these members.
            # Also, party was already forced to "Other" above when blank,
            # so the `party == ""` test below appears unreachable — confirm
            # whether the override is still needed.
            party_override = {" Green": "Democratic",
                              " Sisco": "Republican"}
            if party == "" and full_name in party_override:
                party = party_override[full_name]
            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)
            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)
            # The email cell holds a mailto: link; skip a bare "mailto:".
            email = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"]'
                '/table/tr[4]/td/a/@href')
            if len(email) > 0 and email[0].lower() != 'mailto:':
                email = email[0].split(':')[1]
            else:
                email = None
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='Capitol Office')
            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if len(picture) > 0:
                person.image = picture[0]
            yield person
def _parse_person(self, row, chamber, seat_map):
    """Build a Person from one NH roster row.

    Fix: removed a stray debug `print(district)` that leaked to stdout on
    every row.

    lower-chamber districts are formatted as "<County> <number>";
    upper-chamber districts are the bare number.
    """
    # Capture legislator vitals.
    first_name = row['FirstName']
    middle_name = row['MiddleName']
    last_name = row['LastName']
    full_name = '{} {} {}'.format(first_name, middle_name, last_name)
    # Collapse the doubled space left behind when middle_name is empty.
    full_name = re.sub(r'[\s]{2,}', ' ', full_name)

    if chamber == 'lower':
        district = '{} {}'.format(row['County'], int(row['District'])).strip()
    else:
        district = str(int(row['District'])).strip()

    party = self.party_map[row['party'].upper()]
    email = row['WorkEmail']

    person = Person(primary_org=chamber,
                    district=district,
                    name=full_name,
                    party=party)
    person.extras = {
        'first_name': first_name,
        'middle_name': middle_name,
        'last_name': last_name
    }
    if email:
        person.add_contact_detail(type='email', value=email,
                                  note='District Office')

    # Capture legislator office contact information.
    district_address = '{}\n{}\n{}, {} {}'.format(row['Address'],
                                                  row['address2'],
                                                  row['city'], row['State'],
                                                  row['Zipcode']).strip()
    phone = row['Phone'].strip()
    if not phone:
        phone = None
    if district_address:
        person.add_contact_detail(type='address', value=district_address,
                                  note='Home Office')
    if phone:
        person.add_contact_detail(type='voice', value=phone,
                                  note='Home Office')

    # Retrieve legislator portrait.
    profile_url = None
    if chamber == 'upper':
        profile_url = self.senate_profile_url.format(row['District'])
    elif chamber == 'lower':
        try:
            seat_number = seat_map[row['seatno']]
            profile_url = self.house_profile_url.format(seat_number)
        except KeyError:
            # Unknown seat number: no photo for this member.
            pass
    if profile_url:
        person.image = self._get_photo(profile_url, chamber)
        person.add_source(profile_url)
    return person
def scrape(self):
    """Yield NYC council members (Person) and committees (Organization).

    Fix: the contact-detail loop called the dict — `web(contact_type)` —
    which raises TypeError the moment a contact value is truthy; it now
    subscripts `web[contact_type]` as intended.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'
    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Scrape the web roster, keyed by (stripped) member name.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']
    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)
        # Add past members (and public advocates) missing from the web
        # roster; the defaultdict yields None for every field lookup.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'
            # Drop the zero-padding Legistar puts in district numbers.
            district = web.get('District', '').replace(' 0', ' ')
            p.add_term(role, 'legislature', district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        party = web.get('Political Party')
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            # BUG FIX: was `web(contact_type)` — calling the dict.
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')
        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')
        if web.get('Notes'):
            p.extras = {'Notes': web['Notes']}

        if not p.sources:  # Only add sources once
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = ['Committee',
                       'Inactive Committee',
                       'Select Committee',
                       'Subcommittee',
                       'Task Force',
                       'Land Use',  # Committee on Land Use
                       ]
    body_types = {k: v for k, v in self.body_types().items()
                  if k in committee_types}

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):
            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'],
                                         'New York City Council')
            body_name = body['BodyName']
            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})
            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                         note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'
                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()
                if person in members:
                    p = members[person]
                else:
                    p = Person(person)
                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')
                    members[person] = p
                p.add_membership(o, role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield NYC council members (Person) and committees (Organization).

    Fix: the contact-detail check called the dict — `web(contact_type)` —
    raising TypeError whenever a contact value was present; it now
    subscripts `web[contact_type]` as intended.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'
    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Web roster, keyed by stripped member name.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [
        body for body in self.bodies() if body['BodyName'] == 'City Council'
    ]
    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)
        # Past members and public advocates may be absent from the web
        # roster; a defaultdict stands in, returning None per field.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'
            # Strip Legistar's zero-padded district numbers.
            district = web.get('District', '').replace(' 0', ' ')
            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(
                           term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        party = web.get('Political Party')
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            # BUG FIX: was `web(contact_type)` — calling the dict.
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')
        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')
        if web.get('Notes'):
            p.extras = {'Notes': web['Notes']}

        if not p.sources:  # Only add sources once
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = [
        'Committee', 'Inactive Committee', 'Select Committee',
        'Subcommittee', 'Task Force', 'Land Use'
    ]  # Committee on Land Use
    body_types = {
        k: v
        for k, v in self.body_types().items() if k in committee_types
    }

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):
            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'],
                                         'New York City Council')
            body_name = body['BodyName']
            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})
            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                format(**body),
                note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'
                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()
                if person in members:
                    p = members[person]
                else:
                    p = Person(person)
                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')
                    members[person] = p
                p.add_membership(o,
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield council members and committee Organizations.

    Fix: removed a stray debug `print(start_date, end_date)` that leaked
    to stdout for every term span.

    Consecutive term entries for the same district are merged when one
    starts the day after the previous one ends.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Group roster entries by the member's profile URL; keep the latest
    # entry's committees.
    for councilman, committees in self.councilMembers():
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])

        # Disambiguate from the other Letitia James record.
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge adjacent spans (end + 1 day == next start, same district).
        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) \
                    and district == last_district:
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            district = councilman['District'].replace(' 0', ' ')
            # An end date at the edge of the scraped window means the term
            # is still open.
            if end_date == datetime.date(2017, 12, 31):
                end_date = ''
            else:
                end_date = end_date.isoformat()
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')
        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}
        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if committee_name not in noncommittees \
                    and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o
                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p

    # Yield top-level committees before subcommittees so parents exist
    # when children are imported.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                     classification='committee',
                     parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization('Subcommittee on Drug Abuse',
                     classification='committee',
                     parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape_session(self, session, chambers):
    """Yield Person records for all Georgia legislators in a session.

    Fixes:
    - `next()` on an exhausted generator raises StopIteration, never
      IndexError, so the guard around the service lookup could not fire;
      now catches StopIteration.
    - The district-office guard tested the *capitol* address/contact
      variables (copy-paste from the capitol block); it now tests the
      district ones.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(
        self.sservice.GetMembersBySession,
        sid
    )['MemberListing']

    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # Check to see if the member has vacated; skip if so:
        try:
            legislative_service = next(
                service
                for service in member_info['SessionsInService']['LegislativeService']
                if service['Session']['Id'] == sid
            )
        except StopIteration:
            # next() signals an empty match with StopIteration.
            raise Exception("Something very bad is going on with the "
                            "Legislative service")
        if legislative_service['DateVacated']:
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x] for x in [
                'Nickname', 'First', 'Middle', 'Last'
            ]
        )
        # Prefer the nickname as the displayed first name when present.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (
            legislative_service['District'][x] for x in [
                'Type', 'Number'
            ]
        )
        chamber = {
            "House": 'lower',
            "Senate": 'upper'
        }[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
                                          {"code": guid, "sid": sid})

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'last_name': last_name,
            'first_name': first_name,
            'guid': guid,
        }

        capitol_address = self.clean_list([
            member_info['Address'][x] for x in [
                'Street', 'City', 'State', 'Zip'
            ]
        ])
        capitol_address = " ".join(
            addr_component for addr_component in capitol_address
            if addr_component
        ).strip()
        # Contact info is [email, phone, fax].
        capitol_contact_info = self.clean_list([
            member_info['Address'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        # if we have more than 2 chars (eg state)
        # or a phone/fax/email address record the info
        if len(capitol_address) > 2 or not capitol_contact_info.count(None) == 3:
            if capitol_contact_info[0] and '*****@*****.**' in capitol_contact_info[0]:
                self.warning("XXX: GA SITE WAS HACKED.")
                capitol_contact_info[1] = None

            if capitol_address.strip():
                legislator.add_contact_detail(
                    type='address', value=capitol_address,
                    note='Capitol Address')
            if capitol_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice', value=capitol_contact_info[1],
                    note='Capitol Address')
            if capitol_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax', value=capitol_contact_info[2],
                    note='Capitol Address')
            if capitol_contact_info[0]:
                legislator.add_contact_detail(
                    type='email', value=capitol_contact_info[0],
                    note='Capitol Address')

        district_address = self.clean_list([
            member_info['DistrictAddress'][x] for x in [
                'Street', 'City', 'State', 'Zip'
            ]
        ])
        district_contact_info = self.clean_list([
            member_info['DistrictAddress'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Same issue with district email. See above comment
        if district_contact_info[0] and '@' not in district_contact_info[0]:
            district_contact_info[0] = None

        district_address = " ".join(
            addr_component for addr_component in district_address
            if addr_component
        ).strip()

        # BUG FIX: this guard previously re-tested the capitol variables.
        if len(district_address) > 2 or not district_contact_info.count(None) == 3:
            # NOTE(review): the capitol block checks index 0 (email) for
            # the hack marker; this checks index 1 (phone) — confirm
            # which is intended.
            if (district_contact_info[1] and
                    '*****@*****.**' in district_contact_info[1]):
                self.warning("XXX: GA SITE WAS HACKED.")
                district_contact_info[1] = None

            if district_address.strip():
                legislator.add_contact_detail(
                    type='address', value=district_address,
                    note='District Address')
            if district_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice', value=district_contact_info[1],
                    note='District Address')
            if district_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax', value=district_contact_info[2],
                    note='District Address')
            if district_contact_info[0]:
                legislator.add_contact_detail(
                    type='email', value=district_contact_info[0],
                    note='District Address')

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(
            **{"code": guid, "sid": sid}))
        yield legislator
def _parse_person(self, row, chamber, seat_map):
    """Translate one NH roster row into a Person record.

    Returns None for seats whose district is listed as 0 (unassigned).
    """
    first = row['FirstName']
    middle = row['MiddleName']
    last = row['LastName']
    # Join the three parts, then collapse the doubled space left when the
    # middle name is blank.
    name = re.sub(r'[\s]{2,}', ' ', '{} {} {}'.format(first, middle, last))

    # House districts are "<County> <number>"; Senate districts are bare
    # numbers.
    if chamber == 'lower':
        district = '{} {}'.format(row['County'], int(row['District'])).strip()
    else:
        district = str(int(row['District'])).strip()

    if district == '0':
        self.warning('Skipping {}, district is set to 0'.format(name))
        return

    # Temporary fix for Kari Lerner
    if district == 'Rockingham 0' and last == 'Lerner':
        district = 'Rockingham 4'

    person = Person(primary_org=chamber, district=district,
                    name=name, party=self.party_map[row['party'].upper()])
    person.extras = {
        'first_name': first,
        'middle_name': middle,
        'last_name': last
    }

    work_email = row['WorkEmail']
    if work_email:
        person.add_contact_detail(type='email', value=work_email,
                                  note='District Office')

    # Home-office contact details.
    home_address = '{}\n{}\n{}, {} {}'.format(row['Address'], row['address2'],
                                              row['city'], row['State'],
                                              row['Zipcode']).strip()
    if home_address:
        person.add_contact_detail(type='address', value=home_address,
                                  note='Home Office')
    home_phone = row['Phone'].strip()
    if home_phone:
        person.add_contact_detail(type='voice', value=home_phone,
                                  note='Home Office')

    # Portrait: senators by district, representatives by seat number.
    portrait_url = None
    if chamber == 'upper':
        portrait_url = self.senate_profile_url.format(row['District'])
    elif chamber == 'lower' and row['seatno'] in seat_map:
        portrait_url = self.house_profile_url.format(seat_map[row['seatno']])
    if portrait_url:
        person.image = self._get_photo(portrait_url, chamber)
        person.add_source(portrait_url)
    return person
def _parse_person(self, row, chamber, seat_map):
    """Translate one NH roster row into a Person record.

    Returns None (skips the row) when the district is listed as 0.
    Contact notes are tagged "Capitol Office" or "District Office" based
    on heuristics over the email domain, chamber, and phone exchange.
    """
    # Capture legislator vitals.
    first_name = row["FirstName"]
    middle_name = row["MiddleName"]
    last_name = row["LastName"]
    full_name = "{} {} {}".format(first_name, middle_name, last_name)
    # Collapse the doubled space left when the middle name is blank.
    full_name = re.sub(r"[\s]{2,}", " ", full_name)

    # House districts are "<County> <number>"; Senate are bare numbers.
    if chamber == "lower":
        district = "{} {}".format(row["County"], int(row["District"])).strip()
    else:
        district = str(int(row["District"])).strip()

    party = self.party_map[row["party"].upper()]
    email = row["WorkEmail"]

    if district == "0":
        self.warning("Skipping {}, district is set to 0".format(full_name))
        return

    person = Person(primary_org=chamber, district=district,
                    name=full_name, party=party)
    extras = {
        "first_name": first_name,
        "middle_name": middle_name,
        "last_name": last_name,
    }
    person.extras = extras

    if email:
        # A state-legislature address is treated as the capitol office.
        office = "Capitol" if email.endswith(
            "@leg.state.nh.us") else "District"
        person.add_contact_detail(type="email", value=email,
                                  note=office + " Office")

    # Capture legislator office contact information.
    district_address = "{}\n{}\n{}, {} {}".format(row["Address"],
                                                  row["address2"],
                                                  row["city"], row["State"],
                                                  row["Zipcode"]).strip()
    phone = row["Phone"].strip()
    if not phone:
        phone = None

    if district_address:
        office = "Capitol" if chamber == "upper" else "District"
        person.add_contact_detail(type="address", value=district_address,
                                  note=office + " Office")
    if phone:
        # NOTE(review): "271-" is presumably the Concord capitol phone
        # exchange — confirm.
        office = "Capitol" if "271-" in phone else "District"
        person.add_contact_detail(type="voice", value=phone,
                                  note=office + " Office")

    # Retrieve legislator portrait: senators by district number,
    # representatives by mapped seat number.
    profile_url = None
    if chamber == "upper":
        profile_url = self.senate_profile_url.format(row["District"])
    elif chamber == "lower":
        try:
            seat_number = seat_map[row["seatno"]]
            profile_url = self.house_profile_url.format(seat_number)
        except KeyError:
            # Unknown seat number: no photo for this member.
            pass
    if profile_url:
        person.image = self._get_photo(profile_url, chamber)
        person.add_source(profile_url)
    return person
def scrape_session(self, session, chambers):
    """Yield Person records for all Georgia legislators in a session.

    Fixes (same as the sibling variant of this method):
    - catch StopIteration from next(), not IndexError, so the failure
      guard around the service lookup can actually fire;
    - the district-office guard tested the capitol address/contact
      variables (copy-paste); it now tests the district ones.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(self.sservice.GetMembersBySession,
                      sid)['MemberListing']

    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # Check to see if the member has vacated; skip if so:
        try:
            legislative_service = next(
                service for service in member_info['SessionsInService']
                ['LegislativeService'] if service['Session']['Id'] == sid)
        except StopIteration:
            # next() signals "no match" with StopIteration.
            raise Exception("Something very bad is going on with the "
                            "Legislative service")
        if legislative_service['DateVacated']:
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x]
            for x in ['Nickname', 'First', 'Middle', 'Last'])
        # Prefer the nickname as the displayed first name when present.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (legislative_service['District'][x]
                             for x in ['Type', 'Number'])
        chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
            "code": guid,
            "sid": sid
        })

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'last_name': last_name,
            'first_name': first_name,
            'guid': guid,
        }

        capitol_address = self.clean_list([
            member_info['Address'][x]
            for x in ['Street', 'City', 'State', 'Zip']
        ])
        capitol_address = " ".join(addr_component
                                   for addr_component in capitol_address
                                   if addr_component).strip()
        # Contact info is [email, phone, fax].
        capitol_contact_info = self.clean_list(
            [member_info['Address'][x] for x in ['Email', 'Phone', 'Fax']])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        # if we have more than 2 chars (eg state)
        # or a phone/fax/email address record the info
        if len(capitol_address) > 2 or not capitol_contact_info.count(
                None) == 3:
            if capitol_contact_info[0] and \
                    '*****@*****.**' in capitol_contact_info[0]:
                self.warning("XXX: GA SITE WAS HACKED.")
                capitol_contact_info[1] = None

            if capitol_address.strip():
                legislator.add_contact_detail(type='address',
                                              value=capitol_address,
                                              note='Capitol Address')
            if capitol_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice',
                    value=capitol_contact_info[1],
                    note='Capitol Address')
            if capitol_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax',
                    value=capitol_contact_info[2],
                    note='Capitol Address')
            if capitol_contact_info[0]:
                legislator.add_contact_detail(
                    type='email',
                    value=capitol_contact_info[0],
                    note='Capitol Address')

        district_address = self.clean_list([
            member_info['DistrictAddress'][x]
            for x in ['Street', 'City', 'State', 'Zip']
        ])
        district_contact_info = self.clean_list([
            member_info['DistrictAddress'][x]
            for x in ['Email', 'Phone', 'Fax']
        ])

        # Same issue with district email. See above comment
        if district_contact_info[0] and '@' not in district_contact_info[0]:
            district_contact_info[0] = None

        district_address = " ".join(addr_component
                                    for addr_component in district_address
                                    if addr_component).strip()

        # BUG FIX: this guard previously re-tested the capitol variables.
        if len(district_address) > 2 or not district_contact_info.count(
                None) == 3:
            # NOTE(review): the capitol block checks index 0 (email) for
            # the hack marker; this checks index 1 (phone) — confirm
            # which is intended.
            if (district_contact_info[1]
                    and '*****@*****.**' in district_contact_info[1]):
                self.warning("XXX: GA SITE WAS HACKED.")
                district_contact_info[1] = None

            if district_address.strip():
                legislator.add_contact_detail(type='address',
                                              value=district_address,
                                              note='District Address')
            if district_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice',
                    value=district_contact_info[1],
                    note='District Address')
            if district_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax',
                    value=district_contact_info[2],
                    note='District Address')
            if district_contact_info[0]:
                legislator.add_contact_detail(
                    type='email',
                    value=district_contact_info[0],
                    note='District Address')

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(**{
            "code": guid,
            "sid": sid
        }))
        yield legislator
def transform_parse(self, parsed_form, response):
    """Transform one parsed LDA Form LD-1 filing into pupa entities.

    Yields, in order: the registrant (a Person for self-employed
    individuals, otherwise an Organization), the registrant's synthetic
    self-employment Organization (individuals only), the client, the
    registrant's main contact, affiliated organizations, foreign
    entities, lobbyists, the registration Event, and the Disclosure.

    :param parsed_form: nested dict produced by the LD-1 form parser.
    :param response: HTTP response the filing came from; only
        ``response.url`` is used (for sourcing).
    """
    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['datetimes']['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual':
            parsed_form['registrant']['self_employed_individual'],
        'general_description':
            parsed_form['registrant']['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }
    _disclosure.extras['client'] = {
        'same_as_registrant': parsed_form['client']['client_self'],
        'general_description':
            parsed_form['client']['client_general_description']
    }
    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant':
            parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant':
            parsed_form['registration_type'][
                'new_client_for_existing_registrant'],
    }

    #
    # Registrant
    # build registrant: a Person (plus a synthetic self-employment
    # Organization) when self-employed, otherwise an Organization.
    _registrant_self_employment = None

    if parsed_form['registrant']['self_employed_individual']:
        n = ' '.join([p for p in [
            parsed_form['registrant']['registrant_individual_prefix'],
            parsed_form['registrant']['registrant_individual_firstname'],
            parsed_form['registrant']['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant = Organization(
            name=parsed_form['registrant']['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(parsed_form['registrant']['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(parsed_form['registrant']['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_address_one'],
                    parsed_form['registrant']['registrant_address_two'],
                    parsed_form['registrant']['registrant_city'],
                    parsed_form['registrant']['registrant_state'],
                    parsed_form['registrant']['registrant_zip'],
                    parsed_form['registrant']['registrant_country']]
                if len(p) > 0]).strip(),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['registrant']['registrant_ppb_city'],
                parsed_form['registrant']['registrant_ppb_state'],
                parsed_form['registrant']['registrant_ppb_zip'],
                parsed_form['registrant']['registrant_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    _registrant.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address_one",
                        "value": parsed_form['registrant'][
                            'registrant_address_one'],
                    },
                    {
                        "note": "address_two",
                        "value": parsed_form['registrant'][
                            'registrant_address_two'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_country'],
                    }
                ],
            },
        ]
    }

    #
    # People
    # build contact
    _main_contact = Person(
        name=parsed_form['registrant']['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    # Attach the main contact to whichever org exists: the registrant
    # itself, or the individual's self-employment org.
    if _registrant._type == 'organization':
        _registrant.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant_self_employment.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )

    #
    # Client
    # build client
    _client = Organization(
        name=parsed_form['client']['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_address'],
                    parsed_form['client']['client_city'],
                    parsed_form['client']['client_state'],
                    parsed_form['client']['client_zip'],
                    parsed_form['client']['client_country']]
                if len(p) > 0]).strip(),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['client']['client_ppb_city'],
                parsed_form['client']['client_ppb_state'],
                parsed_form['client']['client_ppb_zip'],
                parsed_form['client']['client_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": parsed_form['client']['client_address'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client']['client_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client'][
                            'client_ppb_country'],
                    }
                ],
            },
        ],
    }

    # Collect Foreign Entities (deduplicated by name)
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        # NOTE(review): both the second entry below and
        # foreign_entity_contact_ppb carry the "principal place of
        # business" note (the latter additionally includes the city), so
        # an entity may end up with two ppb addresses — confirm intended.
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_address'],
                        fe['foreign_entity_city'],
                        fe['foreign_entity_state'],
                        fe['foreign_entity_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(
                foreign_entity_contact_ppb)

        # add contact details
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": fe['foreign_entity_address'],
                    },
                    {
                        "note": "city",
                        "value": fe['foreign_entity_city'],
                    },
                    {
                        "note": "state",
                        "value": fe['foreign_entity_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "state",
                        "value": fe['foreign_entity_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_ppb_country'],
                    }
                ],
            },
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                               fe_extras)
        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

    # TODO: add a variant on memberships to represent inter-org
    # relationships (associations, ownership, etc)
    #
    # _client['memberships'].append({
    #     "id": _foreign_entity['id'],
    #     "classification": "organization",
    #     "name": _foreign_entity['name'],
    #     "extras": {
    #         "ownership_percentage":
    #             fe['foreign_entity_amount']
    #     }
    # })

    # Collect Lobbyists (deduplicated by name)
    # TODO: deal with weird non-name line continuation cases
    # (blanks, "continued")
    _lobbyists_by_name = {}

    for lobbyist_record in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([lobbyist_record['lobbyist_first_name'],
                           lobbyist_record['lobbyist_last_name'],
                           lobbyist_record['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if lobbyist_record['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported':
                        parsed_form['datetimes']['effective_date'],
                    'covered_official_position':
                        lobbyist_record[
                            'lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)
        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    # Lobbyists belong to the registrant org, or to the individual's
    # self-employment org.
    if _registrant._type == 'organization':
        for lobbyist in _lobbyists:
            _registrant.add_member(
                lobbyist,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
    else:
        for lobbyist in _lobbyists:
            _registrant_self_employment.add_member(
                lobbyist,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

    #
    # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs (deduplicated by name)
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = \
                _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_address'],
                        ao['affiliated_organization_city'],
                        ao['affiliated_organization_state'],
                        ao['affiliated_organization_zip'],
                        ao['affiliated_organization_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    ao['affiliated_organization_ppb_city'],
                    ao['affiliated_organization_ppb_state'],
                    ao['affiliated_organization_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        # BUG FIX: the original assignment ended with a stray trailing
        # comma, wrapping this list in a 1-tuple (unlike every sibling
        # "contact_details_structured" section).
        ao_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": ao['affiliated_organization_address'],
                    },
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_state'],
                    },
                    {
                        "note": "zip",
                        "value": ao['affiliated_organization_zip'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_ppb_country'],
                    }
                ],
            },
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)

        # BUG FIX: the original never stored the org back into the dedup
        # dict, so affiliated organizations were silently dropped (never
        # yielded nor added as event participants).
        _affiliated_organizations_by_name[ao_name] = \
            _affiliated_organization

    for unique_affiliated_organization in \
            _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    #
    # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification='registration'
    )

    # add participants
    # BUG FIX: the original repeated this exact add_participant call
    # inside `if _registrant._type == 'person':`, recording the
    # registrant twice for individual registrants; once is enough.
    _event.add_participant(type=_registrant._type,
                           id=_registrant._id,
                           name=_registrant.name,
                           note="registrant")

    _event.add_participant(type=_client._type,
                           id=_client._id,
                           name=_client.name,
                           note="client")

    for lobbyist in _lobbyists:
        _event.add_participant(type=lobbyist._type,
                               id=lobbyist._id,
                               name=lobbyist.name,
                               note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type,
                               id=fe._id,
                               name=fe.name,
                               note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type,
                               id=ao._id,
                               name=ao.name,
                               note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _registrant.add_source(
        url=_source['url'],
        note='registrant'
    )
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(
        url=_source['url'],
        note='client'
    )
    yield _client

    _main_contact.add_source(
        url=_source['url'],
        note='main_contact'
    )
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(
            url=_source['url'],
            note='affiliated_organization'
        )
        yield ao

    for fe in _foreign_entities:
        fe.add_source(
            url=_source['url'],
            note='foreign_entity'
        )
        yield fe

    for lobbyist in _lobbyists:
        lobbyist.add_source(
            url=_source['url'],
            note='lobbyist'
        )
        yield lobbyist

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure
def _scrape_lower_chamber(self):
    """Scrape the House roster page and yield a Person per member.

    Vacant seats are handed to ``self._save_vacant_legislator`` instead
    of being yielded.
    """
    self.info("Scraping lower chamber for legislators.")

    chamber = "lower"
    roster_url = self._reps_url
    roster_doc = lxml.html.fromstring(self.get(roster_url).text)

    # This is the ASP.net table container
    member_table = roster_doc.xpath("//table[@id='theTable']")[0]

    for row in member_table.xpath("tr")[3:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if row.attrib.get("class") == "dxgvEmptyDataRow":
            self.warning("No House members found")
            return

        cells = row.xpath("td")
        last_name = cells[1].text_content().strip()
        first_name = cells[2].text_content().strip()
        full_name = "{} {}".format(first_name, last_name)
        district = str(int(cells[3].text_content().strip()))

        party = cells[4].text_content().strip()
        party = {"D": "Democratic", "R": "Republican"}.get(party, party)
        if not party.strip():
            # Workaround for now.
            party = "Other"

        phone = cells[6].text_content().strip()
        room = cells[7].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else "")

        if last_name == "Vacant":
            # Vacant seats are recorded separately, not yielded.
            vacancy = Person(name=full_name,
                             primary_org=chamber,
                             district=district,
                             party=party)
            vacancy.extras = {
                "first_name": first_name,
                "last_name": last_name
            }
            vacancy.add_contact_detail(type="address",
                                       value=address,
                                       note="Capitol Office")
            if phone.strip():
                vacancy.add_contact_detail(type="voice",
                                           value=phone,
                                           note="Capitol Office")
            vacancy.add_source(roster_url)
            self._save_vacant_legislator(vacancy)
            continue

        party_override = {
            " Green": "Democratic",
            " Sisco": "Republican"
        }
        if party == "" and full_name in party_override:
            party = party_override[full_name]

        details_url = self._rep_details_url.format(district)
        details_page = lxml.html.fromstring(self.get(details_url).text)

        person = Person(name=full_name,
                        primary_org=chamber,
                        district=district,
                        party=party)
        person.extras = {
            "first_name": first_name,
            "last_name": last_name
        }
        person.add_source(roster_url)
        person.add_source(details_url)
        person.add_link(details_url)

        mailto_hrefs = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_lblAddresses"] '
            '//a[starts-with(@href,"mailto:")]/@href')
        if mailto_hrefs and mailto_hrefs[0].lower() != "mailto:":
            email = mailto_hrefs[0].split(":")[1]
        else:
            email = None

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="Capitol Office")
        if phone:
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note="Capitol Office")
        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="Capitol Office")

        photos = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
        if photos:
            person.image = photos[0]

        yield person
def scrape_session(self, session, chambers):
    """Yield a Person for every active member of one legislative session.

    Skips duplicate records (members who switched chambers mid-session
    appear twice in the listing) and members who have vacated their seat.

    :param session: session key into ``SESSION_SITE_IDS``.
    :param chambers: unused here; kept for interface compatibility.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(self.sservice.GetMembersBySession,
                      sid)["MemberListing"]

    # A set makes the duplicate-guid check O(1); iteration order of the
    # listing is unaffected.
    seen_guids = set()
    for member in members:
        guid = member["Id"]
        member_info = backoff(self.sservice.GetMember, guid)

        # If a member switches chambers during the session, they may
        # appear twice. Skip the duplicate record accordingly.
        if guid in seen_guids:
            self.warning("Skipping duplicate record of {}".format(
                member_info["Name"]["Last"]))
            continue
        seen_guids.add(guid)

        # Check to see if the member has vacated; skip if so.
        # A member can have multiple services for a given session,
        # if they switched chambers. Filter these down to just the
        # active service.
        try:
            (legislative_service, ) = [
                service
                for service in member_info["SessionsInService"]
                ["LegislativeService"]
                if service["Session"]["Id"] == sid
                and service["DateVacated"] is None
            ]
        except ValueError:
            self.info("Skipping retired member {}".format(
                member_info["Name"]["Last"]))
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info["Name"][x]
            for x in ["Nickname", "First", "Middle", "Last"])
        # Prefer the nickname when one is given.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service["Party"]
        if party == "Democrat":
            party = "Democratic"
        elif party.strip() == "":
            party = "other"

        chamber, district = (legislative_service["District"][x]
                             for x in ["Type", "Number"])
        chamber = {"House": "lower", "Senate": "upper"}[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
            "code": guid,
            "sid": sid
        })

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            "family_name": last_name,
            "given_name": first_name,
            "guid": guid,
        }

        if (member_info["Address"]["Street"] is not None
                and member_info["Address"]["Street"].strip()):
            capitol_address_info = {
                k: v.strip()
                for k, v in dict(member_info["Address"]).items()
                if k in ["Street", "City", "State", "Zip"]
            }
            capitol_address = "{Street}\n{City}, {State} {Zip}".format(
                **capitol_address_info)
            legislator.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Address")
        else:
            self.warning(
                "Could not find full capitol address for {}".format(
                    full_name))

        capitol_contact_info = self.clean_list(
            [member_info["Address"][x] for x in ["Email", "Phone", "Fax"]])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and "@" not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        if (capitol_contact_info[0]
                and "*****@*****.**" in capitol_contact_info[0]):
            # Site was hacked in the past. Fail loudly rather than ingest
            # tainted data; a bare `assert` would be stripped under
            # `python -O`.
            raise ValueError(
                "Hacked-site marker in capitol email for {}".format(
                    full_name))

        if capitol_contact_info[1]:
            legislator.add_contact_detail(type="voice",
                                          value=capitol_contact_info[1],
                                          note="Capitol Address")
        if capitol_contact_info[2]:
            legislator.add_contact_detail(type="fax",
                                          value=capitol_contact_info[2],
                                          note="Capitol Address")
        if capitol_contact_info[0]:
            legislator.add_contact_detail(type="email",
                                          value=capitol_contact_info[0],
                                          note="Capitol Address")

        if (member_info["DistrictAddress"]["Street"] is not None
                and member_info["DistrictAddress"]["Street"].strip()):
            district_address_info = {
                k: v.strip()
                for k, v in dict(member_info["DistrictAddress"]).items()
                if k in ["Street", "City", "State", "Zip"]
            }
            district_address = "{Street}\n{City}, {State} {Zip}".format(
                **district_address_info)
            legislator.add_contact_detail(type="address",
                                          value=district_address,
                                          note="District Address")
        else:
            self.warning(
                "Could not find full district address for {}".format(
                    full_name))

        district_contact_info = self.clean_list([
            member_info["DistrictAddress"][x]
            for x in ["Email", "Phone", "Fax"]
        ])

        # Same issue with district email; see the capitol email comment.
        if district_contact_info[0] and "@" not in district_contact_info[0]:
            district_contact_info[0] = None

        if (district_contact_info[0]
                and "*****@*****.**" in district_contact_info[0]):
            # Site was hacked in the past (see above).
            raise ValueError(
                "Hacked-site marker in district email for {}".format(
                    full_name))

        if district_contact_info[1]:
            legislator.add_contact_detail(
                type="voice",
                value=district_contact_info[1],
                note="District Address",
            )
        if district_contact_info[2]:
            legislator.add_contact_detail(type="fax",
                                          value=district_contact_info[2],
                                          note="District Address")
        if district_contact_info[0]:
            legislator.add_contact_detail(
                type="email",
                value=district_contact_info[0],
                note="District Address",
            )

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(**{
            "code": guid,
            "sid": sid
        }))

        yield legislator
def _scrape_lower_chamber(self):
    """Scrape the House roster (DevExpress grid) and yield Person objects.

    Vacant seats are recorded via ``self._save_vacant_legislator``
    rather than yielded.
    """
    self.info('Scraping lower chamber for legislators.')

    chamber = 'lower'
    roster_url = self._reps_url
    roster_doc = lxml.html.fromstring(self.get(roster_url).text)

    # This is the ASP.net table container
    member_table = roster_doc.xpath(
        'id("ContentPlaceHolder1_gridMembers_DXMainTable")')[0]

    for row in member_table.xpath('tr')[1:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if row.attrib.get('class') == 'dxgvEmptyDataRow':
            self.warning('No House members found')
            return

        cells = row.xpath('td')
        last_name = cells[0].text_content().strip()
        first_name = cells[1].text_content().strip()
        full_name = '{} {}'.format(first_name, last_name)
        district = str(int(cells[2].text_content().strip()))

        party = cells[3].text_content().strip()
        party = {'Democrat': 'Democratic'}.get(party, party)
        if not party.strip():
            # Workaround for now.
            party = "Other"

        phone = cells[4].text_content().strip()
        room = cells[5].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else '')

        if last_name == 'Vacant':
            # Vacant seats go to the vacancy store, not the yield stream.
            vacancy = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            vacancy.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            vacancy.add_contact_detail(type='address',
                                       value=address,
                                       note='Capitol Office')
            if phone.strip():
                vacancy.add_contact_detail(type='voice',
                                           value=phone,
                                           note='Capitol Office')
            vacancy.add_source(roster_url)
            self._save_vacant_legislator(vacancy)
            continue

        party_override = {
            " Green": "Democratic",
            " Sisco": "Republican",
        }
        if party == "" and full_name in party_override:
            party = party_override[full_name]

        details_url = self._rep_details_url.format(district)
        details_page = lxml.html.fromstring(self.get(details_url).text)

        person = Person(
            name=full_name,
            primary_org=chamber,
            district=district,
            party=party,
        )
        person.extras = {
            'first_name': first_name,
            'last_name': last_name,
        }
        person.add_source(roster_url)
        person.add_source(details_url)
        person.add_link(details_url)

        email_hrefs = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_lblAddresses"]'
            '/table/tr[4]/td/a/@href'
        )
        if email_hrefs and email_hrefs[0].lower() != 'mailto:':
            email = email_hrefs[0].split(':')[1]
        else:
            email = None

        person.add_contact_detail(type='address',
                                  value=address,
                                  note='Capitol Office')
        if phone:
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
        if email:
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')

        photos = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
        if photos:
            person.image = photos[0]

        yield person
def scrape_session(self, session, chambers):
    """Yield a Person for every active member of one legislative session.

    Skips duplicate records (members who switched chambers mid-session
    appear twice) and members who have vacated their seat.

    :param session: session key into ``SESSION_SITE_IDS``.
    :param chambers: unused here; kept for interface compatibility.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(
        self.sservice.GetMembersBySession,
        sid
    )['MemberListing']

    # A set gives O(1) duplicate checks.
    seen_guids = set()
    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # If a member switches chambers during the session, they may
        # appear twice. Skip the duplicate record accordingly.
        if guid in seen_guids:
            self.warning('Skipping duplicate record of {}'.format(
                member_info['Name']['Last']))
            continue
        seen_guids.add(guid)

        # Check to see if the member has vacated; skip if so.
        # A member can have multiple services for a given session,
        # if they switched chambers. Filter these down to just the
        # active service.
        try:
            (legislative_service, ) = [
                service for service
                in member_info['SessionsInService']['LegislativeService']
                if service['Session']['Id'] == sid
                and service['DateVacated'] is None
            ]
        except ValueError:
            self.info('Skipping retired member {}'.format(
                member_info['Name']['Last']))
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x] for x in [
                'Nickname', 'First', 'Middle', 'Last'
            ]
        )
        # Prefer the nickname when one is given.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (
            legislative_service['District'][x] for x in [
                'Type', 'Number'
            ]
        )
        chamber = {
            "House": 'lower',
            "Senate": 'upper'
        }[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
                                          {"code": guid, "sid": sid})

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'family_name': last_name,
            'given_name': first_name,
            'guid': guid,
        }

        if member_info['Address']['Street'] is not None and \
                member_info['Address']['Street'].strip():
            capitol_address_info = {
                k: v.strip()
                for k, v in dict(member_info['Address']).items()
                if k in ['Street', 'City', 'State', 'Zip']
            }
            capitol_address = '{Street}\n{City}, {State} {Zip}'.format(
                **capitol_address_info)
            legislator.add_contact_detail(
                type='address', value=capitol_address,
                note='Capitol Address')
        else:
            self.warning(
                'Could not find full capitol address for {}'.format(
                    full_name))

        capitol_contact_info = self.clean_list([
            member_info['Address'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        if (capitol_contact_info[0]
                and '*****@*****.**' in capitol_contact_info[0]):
            # Site was hacked in the past. Fail loudly rather than ingest
            # tainted data; a bare `assert` would be stripped under
            # `python -O`.
            raise ValueError(
                'Hacked-site marker in capitol email for {}'.format(
                    full_name))

        if capitol_contact_info[1]:
            legislator.add_contact_detail(
                type='voice', value=capitol_contact_info[1],
                note='Capitol Address')
        if capitol_contact_info[2]:
            legislator.add_contact_detail(
                type='fax', value=capitol_contact_info[2],
                note='Capitol Address')
        if capitol_contact_info[0]:
            legislator.add_contact_detail(
                type='email', value=capitol_contact_info[0],
                note='Capitol Address')

        if member_info['DistrictAddress']['Street'] is not None and \
                member_info['DistrictAddress']['Street'].strip():
            district_address_info = {
                k: v.strip()
                for k, v in dict(member_info['DistrictAddress']).items()
                if k in ['Street', 'City', 'State', 'Zip']
            }
            district_address = '{Street}\n{City}, {State} {Zip}'.format(
                **district_address_info)
            legislator.add_contact_detail(
                type='address', value=district_address,
                note='District Address')
        else:
            self.warning(
                'Could not find full district address for {}'.format(
                    full_name))

        district_contact_info = self.clean_list([
            member_info['DistrictAddress'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Same issue with district email; see the capitol email comment.
        if district_contact_info[0] and '@' not in district_contact_info[0]:
            district_contact_info[0] = None

        if (district_contact_info[0]
                and '*****@*****.**' in district_contact_info[0]):
            # Site was hacked in the past (see above).
            raise ValueError(
                'Hacked-site marker in district email for {}'.format(
                    full_name))

        if district_contact_info[1]:
            legislator.add_contact_detail(
                type='voice', value=district_contact_info[1],
                note='District Address')
        if district_contact_info[2]:
            legislator.add_contact_detail(
                type='fax', value=district_contact_info[2],
                note='District Address')
        if district_contact_info[0]:
            legislator.add_contact_detail(
                type='email', value=district_contact_info[0],
                note='District Address')

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(
            **{"code": guid, "sid": sid}))

        yield legislator
def scrape(self):
    """Scrape NYC council members and their committees.

    Yields a ``Person`` for each council member (with merged terms,
    party, photo, contact info, and committee memberships) followed by
    the committee/subcommittee ``Organization`` objects, plus two
    hard-coded organizations missing from the Legistar department list.
    """
    # Bodies that look like committees but should not be emitted as such.
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Go to memberlist: restrict the Legistar query to City Council members.
    extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

    # A member can appear once per term; group all of their rows by the
    # URL of their person page so terms can be merged below. Rows with
    # no URL are skipped entirely.
    for councilman, committees in self.councilMembers(extra_args=extra_args):
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']

            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                # First sighting: keep this row's committee listing too.
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        # Use the most recent row as the canonical record.
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])

        # Special-case disambiguation: the source lists her as
        # "Letitia James"; keep that as an alternate name.
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        # One (start, end, district) span per source row.
        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge chronologically-adjacent spans (end date followed the
        # next day by the next start date) in the same district into a
        # single continuous term.
        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) \
                    and district == last_district:
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            # NOTE(review): this overwrites the merged span's district
            # with the latest row's district for every term — confirm
            # that is intentional (historic districts are discarded).
            district = councilman['District'].replace(' 0', ' ')
            if end_date == datetime.date(2017, 12, 31):
                # Sentinel end date meaning "current term": leave open.
                end_date = ''
            else:
                end_date = end_date.isoformat()
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}

        p.add_source(councilman['Person Name']['url'], note='web')

        # Attach committee memberships, creating each committee
        # Organization the first time it is seen.
        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if committee_name not in noncommittees \
                    and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p

    # Emit committees before subcommittees so parents import first.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # These two bodies are absent from the scraped department listing,
    # so emit them by hand.
    o = Organization(
        'Committee on Mental Health, Developmental Disability, Alcoholism, '
        'Drug Abuse and Disability Services',
        classification='committee',
        parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization(
        'Subcommittee on Drug Abuse',
        classification='committee',
        parent_id={
            'name': 'Committee on Mental Health, Developmental Disability, '
                    'Alcoholism, Drug Abuse and Disability Services'
        })
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape_session(self, session, chambers):
    """Scrape all active Georgia legislators for one session.

    Pulls the member list from the SOAP service, skips duplicates and
    vacated members, and yields one ``Person`` per active legislator
    with party, chamber/district, photo, and Capitol/District contact
    information attached.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(self.sservice.GetMembersBySession, sid)['MemberListing']

    seen_guids = []
    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # If a member switches chambers during the session, they may
        # appear twice. Skip the duplicate record accordingly.
        if guid in seen_guids:
            self.warning('Skipping duplicate record of {}'.format(
                member_info['Name']['Last']))
            continue
        else:
            seen_guids.append(guid)

        # Check to see if the member has vacated; skip if so.
        # A member can have multiple services for a given session,
        # if they switched chambers. Filter these down to just the
        # active service.
        try:
            (legislative_service, ) = [
                service
                for service in member_info['SessionsInService']
                ['LegislativeService']
                if service['Session']['Id'] == sid
                and service['DateVacated'] is None
            ]
        except ValueError:
            # Zero (or >1) active services unpacks to a ValueError.
            self.info('Skipping retired member {}'.format(
                member_info['Name']['Last']))
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x]
            for x in ['Nickname', 'First', 'Middle', 'Last'])

        # Prefer the nickname as the displayed first name when present.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (legislative_service['District'][x]
                             for x in ['Type', 'Number'])
        chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
            "code": guid,
            "sid": sid
        })

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'last_name': last_name,
            'first_name': first_name,
            'guid': guid,
        }

        # Capitol and District offices share the same record shape, so
        # a single helper handles both.
        self._add_office_contact_details(
            legislator, member_info['Address'],
            'Capitol Address', 'capitol', full_name)
        self._add_office_contact_details(
            legislator, member_info['DistrictAddress'],
            'District Address', 'district', full_name)

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(**{
            "code": guid,
            "sid": sid
        }))

        yield legislator

def _add_office_contact_details(self, legislator, office_info, note,
                                kind, full_name):
    """Attach one office's address/phone/fax/email to *legislator*.

    :param office_info: SOAP record with ``Street``/``City``/``State``/
        ``Zip``/``Email``/``Phone``/``Fax`` fields.
    :param note: ContactDetail note, e.g. ``'Capitol Address'``.
    :param kind: lowercase label for warnings ('capitol' / 'district').
    :param full_name: member name used in the missing-address warning.
    """
    if office_info['Street'] is not None and office_info['Street'].strip():
        address_parts = {
            k: v.strip()
            for k, v in dict(office_info).items()
            if k in ['Street', 'City', 'State', 'Zip']
        }
        address = '{Street}\n{City}, {State} {Zip}'.format(**address_parts)
        legislator.add_contact_detail(type='address',
                                      value=address,
                                      note=note)
    else:
        self.warning('Could not find full {} address for {}'.format(
            kind, full_name))

    email, phone, fax = self.clean_list(
        [office_info[x] for x in ['Email', 'Phone', 'Fax']])

    # Sometimes email is set to a long cryptic string.
    # If it doesn't have a @ character, simply set it to None
    # examples:
    # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
    # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
    if email and '@' not in email:
        email = None

    if email:
        # Site was hacked in the past
        assert '*****@*****.**' not in email

    if phone:
        legislator.add_contact_detail(type='voice', value=phone, note=note)
    if fax:
        legislator.add_contact_detail(type='fax', value=fax, note=note)
    if email:
        legislator.add_contact_detail(type='email', value=email, note=note)