Example #1
def test_person_add_party():
    p = Person('Groot')
    p.add_party('Green')
    p._related[0].validate()
    assert get_pseudo_id(p._related[0].organization_id) == {
        'name': 'Green',
        'classification': 'party'
    }
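
For context: in pupa, add_party does not resolve the party to a database id at scrape time; it stores a "pseudo id" on the membership's organization_id, and get_pseudo_id decodes it back into a dict. A minimal sketch of that round trip, assuming pupa's convention of a '~' marker followed by JSON:

import json

def _make_pseudo_id(**kwargs):
    # Encode an unresolved reference as '~' plus a JSON payload,
    # which is what pupa stores in organization_id for parties.
    return '~' + json.dumps(kwargs, sort_keys=True)

def get_pseudo_id(pid):
    # Inverse of _make_pseudo_id: check the marker, then parse the JSON.
    if not pid.startswith('~'):
        raise ValueError('not a pseudo id: ' + pid)
    return json.loads(pid[1:])

# The assertion in the test above depends on exactly this round trip:
assert get_pseudo_id(_make_pseudo_id(name='Green', classification='party')) == {
    'name': 'Green',
    'classification': 'party',
}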
Example #2
    def scrape(self):
        current_path = Path(__file__)
        legislator_path = current_path.parent / 'congress-legislators/legislators-historical.yaml'

        with legislator_path.open() as f:
            legislators = yaml.load(f, Loader=yaml.CLoader)

        for legislator in legislators:
            if all(term['end'] < '1970' for term in legislator['terms']):
                continue

            l = Person(name=' '.join(
                (legislator['name']['first'], legislator['name']['last'])),
                       birth_date=legislator['bio'].get('birthday', ''),
                       gender=legislator['bio']['gender'])

            parties = set()
            for term in legislator['terms']:
                state = term['state']
                parties.add(term['party'])

                if term['type'] == 'rep':
                    role = 'Representative'
                    district_name = self._district_name(
                        state, term['district'])
                    chamber = 'lower'
                else:
                    role = "Senator"
                    district_name = "{state}, Class {klass}".format(
                        state=state, klass=term['class'])
                    chamber = 'upper'

                l.add_term(role,
                           chamber,
                           district=district_name,
                           start_date=term['start'],
                           end_date=term['end'])

            for party in parties:
                l.add_party(party)

            for scheme, identifier in legislator['id'].items():
                l.add_identifier(str(identifier), scheme=scheme)

            l.add_source(
                'https://github.com/unitedstates/congress-legislators/blob/master/legislators-historical.yaml'
            )

            yield l
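
self._district_name is defined elsewhere in this scraper and is not shown above; the sketch below is a hypothetical stand-in that only illustrates the kind of label the call needs to produce (the exact naming scheme of the real helper is an assumption):

    def _district_name(self, state, district):
        # Hypothetical helper: build a readable district label from the state
        # code and the district number found in the congress-legislators YAML.
        district = str(district)
        if district == '0':
            # At-large House seats appear as district 0 in the YAML data.
            return '{} At-Large'.format(state)
        return '{} District {}'.format(state, district)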
Example #3
    def scrape(self):
        noncommittees = {'Committee of the Whole'}
        committee_d = {}

        people_d = {}

        # Go to memberlist
        extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

        for councilman, committees in self.councilMembers(
                extra_args=extra_args):

            if 'url' in councilman['Person Name']:
                councilman_url = councilman['Person Name']['url']

                if councilman_url in people_d:
                    people_d[councilman_url][0].append(councilman)
                else:
                    people_d[councilman_url] = [councilman], committees

        for person_entries, committees in people_d.values():

            councilman = person_entries[-1]

            p = Person(councilman['Person Name']['label'])

            if p.name == 'Letitia James':
                p.name = 'Letitia Ms. James'
                p.add_name('Letitia James')

            spans = [(self.toTime(entry['Start Date']).date(),
                      self.toTime(entry['End Date']).date(), entry['District'])
                     for entry in person_entries]

            merged_spans = []
            last_end_date = None
            last_district = None
            for start_date, end_date, district in sorted(spans):
                if last_end_date is None:
                    span = [start_date, end_date, district]
                elif (start_date - last_end_date
                      ) == datetime.timedelta(1) and district == last_district:
                    span[1] = end_date
                else:
                    merged_spans.append(span)
                    span = [start_date, end_date, district]

                last_end_date = end_date
                last_district = district

            merged_spans.append(span)

            for start_date, end_date, district in merged_spans:
                district = councilman['District'].replace(' 0', ' ')
                if end_date == datetime.date(2017, 12, 31):
                    end_date = ''
                else:
                    end_date = end_date.isoformat()
                print(start_date, end_date)
                p.add_term('Council Member',
                           'legislature',
                           district=district,
                           start_date=start_date.isoformat(),
                           end_date=end_date)

            party = councilman['Political Party']
            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if councilman['Photo']:
                p.image = councilman['Photo']

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.extras = {'Notes': councilman['Notes']}

            p.add_source(councilman['Person Name']['url'], note='web')

            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']
                if committee_name not in noncommittees and 'committee' in committee_name.lower(
                ):
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        parent_id = PARENT_ORGS.get(committee_name,
                                                    'New York City Council')
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name': parent_id})
                        o.add_source(committee['Department Name']['url'])
                        committee_d[committee_name] = o

                    membership = o.add_member(p, role=committee["Title"])
                    membership.start_date = self.mdY2Ymd(
                        committee["Start Date"])
            yield p

        for o in committee_d.values():
            if 'Committee' in o.name:
                yield o

        for o in committee_d.values():
            if 'Subcommittee' in o.name:
                yield o

        o = Organization(
            'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
            classification='committee',
            parent_id={'name': 'New York City Council'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o

        o = Organization(
            'Subcommittee on Drug Abuse',
            classification='committee',
            parent_id={
                'name':
                'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'
            })
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o
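
The span-merging loop in the middle of this scraper is easier to follow in isolation. The sketch below reimplements the same idea as a standalone function (merge terms whose dates are contiguous and whose district matches); the sample data is hypothetical:

import datetime

def merge_contiguous_spans(spans):
    # spans: iterable of (start_date, end_date, district) tuples.
    # A span that starts exactly one day after the previous span ends, in the
    # same district, is folded into it, mirroring the loop above. Unlike the
    # loop above, this also handles an empty input.
    merged = []
    current = None
    for start, end, district in sorted(spans):
        if (current is not None
                and (start - current[1]) == datetime.timedelta(days=1)
                and district == current[2]):
            current[1] = end
        else:
            if current is not None:
                merged.append(tuple(current))
            current = [start, end, district]
    if current is not None:
        merged.append(tuple(current))
    return merged

# Hypothetical sample: two back-to-back terms, then a term after a gap.
spans = [
    (datetime.date(2010, 1, 1), datetime.date(2013, 12, 31), 'District 1'),
    (datetime.date(2014, 1, 1), datetime.date(2017, 12, 31), 'District 1'),
    (datetime.date(2019, 1, 1), datetime.date(2021, 12, 31), 'District 1'),
]
assert merge_contiguous_spans(spans) == [
    (datetime.date(2010, 1, 1), datetime.date(2017, 12, 31), 'District 1'),
    (datetime.date(2019, 1, 1), datetime.date(2021, 12, 31), 'District 1'),
]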
Example #4
    def scrape(self):
        # chambers = [chamber] if chamber is not None else ['upper', 'lower']
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {"H": "lower", "S": "upper"}[row["office code"]]

            district = row["dist"].lstrip("0")
            assert district.isdigit(), "Invalid district found: {}".format(district)

            name = row["first name"]
            mid = row["middle initial"].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row["last name"]
            suffix = row["suffix"].strip()
            if suffix:
                name += " %s" % suffix

            party = row["party"]
            if party == "Democrat":
                party = "Democratic"

            leg = Person(primary_org=chamber, name=name, district=district, party=party)

            legislator_url = row["URL"].replace("\\", "//").strip()
            if legislator_url != "":
                if not legislator_url.startswith("http"):
                    legislator_url = "http://"
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row["capitol street address"],
                row["room number"],
            )
            # extra_office_fields = dict()
            email = row["email"].strip()
            if "@" not in email:
                if not email:
                    email = None
                elif email.startswith("http://") or email.startswith("https://"):
                    # extra_office_fields['contact_form'] = email
                    email = None
                else:
                    raise ValueError("Problematic email found: {}".format(email))
            leg.add_contact_detail(
                type="address", value=office_address, note="Capitol Office"
            )
            leg.add_contact_detail(
                type="voice", value=row["capitol phone"], note="Capitol Office"
            )
            if email:
                leg.add_contact_detail(type="email", value=email)

            home_address = "{}\n{}, {} {}".format(
                row["home street address"],
                row["home city"],
                row["home state"],
                row["home zip code"],
            )
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(
                    type="address", value=home_address, note="District Office"
                )
                if row["home phone"].strip():
                    leg.add_contact_detail(
                        type="voice", value=row["home phone"], note="District Office"
                    )
            leg.add_source(leg_url)

            for comm_name in row["committee member1"].split(";"):
                if " (" in comm_name:
                    comm_name, role = comm_name.split(" (")
                    role = role.strip(")").lower()
                else:
                    role = "member"
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        com = Organization(
                            comm_name, classification="committee", chamber=chamber
                        )
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
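
open_csv and HEADERS come from the scraper's utility module and are not shown here. A minimal sketch of what open_csv plausibly does, assuming it simply wraps the already-downloaded response text in a csv.DictReader so rows can be read by column name (e.g. row["dist"]):

import csv
import io

def open_csv(response):
    # Assumed helper: the response was fetched with self.get() above, so its
    # .text attribute holds the raw CSV; DictReader exposes it row by row.
    return csv.DictReader(io.StringIO(response.text))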
Example #5
    def scrape(self):
        # chambers = [chamber] if chamber is not None else ['upper', 'lower']
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

            district = row['dist'].lstrip('0')
            assert district.isdigit(), "Invalid district found: {}".format(
                district)

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Person(primary_org=chamber,
                         name=name,
                         district=district,
                         party=party)

            legislator_url = row['URL'].replace('\\', '//').strip()
            if legislator_url != '':
                if not legislator_url.startswith('http'):
                    legislator_url = 'http://' + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row['capitol street address'], row['room number'])
            # extra_office_fields = dict()
            email = row['email'].strip()
            if "@" not in email:
                if not email:
                    email = None
                elif email.startswith('http://') or email.startswith(
                        'https://'):
                    # extra_office_fields['contact_form'] = email
                    email = None
                else:
                    raise ValueError(
                        "Problematic email found: {}".format(email))
            leg.add_contact_detail(type='address',
                                   value=office_address,
                                   note='Capitol Office')
            leg.add_contact_detail(type='voice',
                                   value=row['capitol phone'],
                                   note='Capitol Office')
            if email:
                leg.add_contact_detail(type='email', value=email)

            home_address = "{}\n{}, {} {}".format(
                row['home street address'],
                row['home city'],
                row['home state'],
                row['home zip code'],
            )
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(type='address',
                                       value=home_address,
                                       note='District Office')
                if row['home phone'].strip():
                    leg.add_contact_detail(type='voice',
                                           value=row['home phone'],
                                           note='District Office')
            leg.add_source(leg_url)

            for comm_name in row['committee member1'].split(';'):
                if ' (' in comm_name:
                    comm_name, role = comm_name.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        com = Organization(comm_name,
                                           classification='committee',
                                           chamber=chamber)
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
Example #6
    def scrape_chamber(self, chamber):
        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster/?body=' + body
        page = self.get(url).text

        # there is a bad comment closing tag on this page
        page = page.replace('--!>', '-->')

        root = html.fromstring(page)

        path = '//table//tr'
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            name, district, party, email, room, phone, = row.xpath('td')

            if email.attrib.get('class') == 'vacantmember':
                continue  # Skip any vacant members.

            link = name.xpath('string(a/@href)')
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()
            if '--' in name:
                name = name.split('--')[0].strip()

            linkpage = self.get(link).text
            linkpage = linkpage.replace('--!>', '-->')
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

            if len(photos) != 1:
                self.warning('no photo on ' + link)
                photo_url = ''
            else:
                photo_url = photos[0].attrib['src']

            district = district.text_content().strip()
            party = party.text_content().strip()
            email = email.text_content().strip()

            if email.startswith('Email: '):
                email = email.replace('Email: ', '').lower() + '@azleg.gov'
            else:
                email = ''

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == 'lower':
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = address + "1700 West Washington\n Room " + room  \
                              + "\nPhoenix, AZ 85007"

            phone = phone.text_content().strip()
            if '602' not in re.findall(r'(\d+)', phone):
                phone = "602-" + phone

            leg = Person(primary_org=chamber, image=photo_url, name=name, district=district,
                         party=party)
            leg.add_contact_detail(type='address', value=address, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type='email', value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)
                # leg.add_role(position, term, chamber=chamber,
                #             district=district, party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            yield leg
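
The phone handling above is terse: re.findall(r'(\d+)', phone) splits the number into digit groups, and the Phoenix area code is prepended only when no group is exactly '602'. The same check pulled out as a standalone function, with hypothetical inputs:

import re

def normalize_az_phone(phone):
    # Prepend the Phoenix area code unless one of the digit groups already
    # equals 602, matching the check in scrape_chamber above.
    if '602' not in re.findall(r'(\d+)', phone):
        phone = '602-' + phone
    return phone

assert normalize_az_phone('926-3300') == '602-926-3300'
assert normalize_az_phone('(602) 926-3300') == '(602) 926-3300'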
Example #7
    def scrape(self):
        # chambers = [chamber] if chamber is not None else ['upper', 'lower']
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

            district = row['dist'].lstrip('0')
            assert district.isdigit(), "Invalid district found: {}".format(district)

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Person(primary_org=chamber,
                         name=name,
                         district=district,
                         party=party
                         )

            legislator_url = row['URL'].replace('\\', '//').strip()
            if legislator_url != '':
                if not legislator_url.startswith('http'):
                    legislator_url = 'http://' + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row['capitol street address'], row['room number'])
            # extra_office_fields = dict()
            email = row['email'].strip()
            if "@" not in email:
                if not email:
                    email = None
                elif email.startswith('http://') or email.startswith('https://'):
                    # extra_office_fields['contact_form'] = email
                    email = None
                else:
                    raise ValueError("Problematic email found: {}".format(email))
            leg.add_contact_detail(type='address', value=office_address, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=row['capitol phone'], note='Capitol Office')
            if email:
                leg.add_contact_detail(type='email', value=email)

            home_address = "{}\n{}, {} {}".format(
                row['home street address'],
                row['home city'],
                row['home state'],
                row['home zip code'],
            )
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(type='address', value=home_address, note='District Office')
                if row['home phone'].strip():
                    leg.add_contact_detail(type='voice',
                                           value=row['home phone'],
                                           note='District Office')
            leg.add_source(leg_url)

            for comm_name in row['committee member1'].split(';'):
                if ' (' in comm_name:
                    comm_name, role = comm_name.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        com = Organization(comm_name, classification='committee', chamber=chamber)
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
Example #8
    def scrape(self):
        web_scraper = LegistarPersonScraper(requests_per_minute = self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        city_council, = [body for body in self.bodies()
                         if body['BodyName'] == 'City Council']

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates)
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info.keys()) == set(terms.keys())

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            for term in offices:
                role = term['OfficeRecordTitle']

                if role == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                district = web.get('District', '').replace(' 0', ' ')

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                party = web.get('Political Party')

                if party == 'Democrat':
                    party = 'Democratic'

                if party:
                    p.add_party(party)

                if web.get('Photo'):
                    p.image = web['Photo']

                contact_types = {
                    "City Hall Office": ("address", "City Hall Office"),
                    "City Hall Phone": ("voice", "City Hall Phone"),
                    "Ward Office Phone": ("voice", "Ward Office Phone"),
                    "Ward Office Address": ("address", "Ward Office Address"),
                    "Fax": ("fax", "Fax")
                }

                for contact_type, (type_, _note) in contact_types.items():
                    if web.get(contact_type) and web[contact_type] != 'N/A':
                        p.add_contact_detail(type=type_,
                                             value= web[contact_type],
                                             note=_note)

                if web.get('E-mail'):
                    p.add_contact_detail(type="email",
                                         value=web['E-mail']['url'],
                                         note='E-mail')

                if web.get('Web site'):
                    p.add_link(web['Web site']['url'], note='web site')

                if web.get('Notes'):
                    p.extras = {'Notes': web['Notes']}

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            members[member] = p

        committee_types = ['Committee',
                           'Inactive Committee',
                           'Select Committee',
                           'Subcommittee',
                           'Task Force',
                           'Land Use', # Committee on Land Use
                          ]

        body_types = {k: v for k, v in self.body_types().items()
                      if k in committee_types}

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(office['OfficeRecordStartDate']),
                                     end_date=self.toDate(office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
Example #9
    def scrape(self):
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        city_council, = [
            body for body in self.bodies()
            if body['BodyName'] == 'City Council'
        ]

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates)
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info.keys()) == set(terms.keys())

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            for term in offices:
                role = term['OfficeRecordTitle']

                if role == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                district = web.get('District', '').replace(' 0', ' ')

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                party = web.get('Political Party')

                if party == 'Democrat':
                    party = 'Democratic'

                if party:
                    p.add_party(party)

                if web.get('Photo'):
                    p.image = web['Photo']

                contact_types = {
                    "City Hall Office": ("address", "City Hall Office"),
                    "City Hall Phone": ("voice", "City Hall Phone"),
                    "Ward Office Phone": ("voice", "Ward Office Phone"),
                    "Ward Office Address": ("address", "Ward Office Address"),
                    "Fax": ("fax", "Fax")
                }

                for contact_type, (type_, _note) in contact_types.items():
                    if web.get(contact_type) and web[contact_type] != 'N/A':
                        p.add_contact_detail(type=type_,
                                             value=web[contact_type],
                                             note=_note)

                if web.get('E-mail'):
                    p.add_contact_detail(type="email",
                                         value=web['E-mail']['url'],
                                         note='E-mail')

                if web.get('Web site'):
                    p.add_link(web['Web site']['url'], note='web site')

                if web.get('Notes'):
                    p.extras = {'Notes': web['Notes']}

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            members[member] = p

        committee_types = [
            'Committee', 'Inactive Committee', 'Select Committee',
            'Subcommittee', 'Task Force', 'Land Use'
        ]  # Committee on Land Use

        body_types = {
            k: v
            for k, v in self.body_types().items() if k in committee_types
        }

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'],
                                             'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
Example #10
    def scrape(self):
        noncommittees = {'Committee of the Whole'}
        committee_d = {}

        people_d = {}

        for councilman, committees in self.councilMembers() :

            
            if 'url' in councilman['Person Name'] :
                councilman_url = councilman['Person Name']['url']

                if councilman_url in people_d :
                    people_d[councilman_url][0].append(councilman) 
                else :
                    people_d[councilman_url] = [councilman], committees

        for person_entries, committees in people_d.values() :

            councilman = person_entries[-1]
            
            p = Person(councilman['Person Name']['label'])
            
            if p.name == 'Letitia James' :
                p.name = 'Letitia Ms. James'
                p.add_name('Letitia James')

            spans = [(self.toTime(entry['Start Date']).date(), 
                      self.toTime(entry['End Date']).date(),
                      entry['District'])
                     for entry in person_entries]

            merged_spans = []
            last_end_date = None
            last_district = None
            for start_date, end_date, district in sorted(spans) :
                if last_end_date is None :
                    span = [start_date, end_date, district]
                elif (start_date - last_end_date) == datetime.timedelta(1) and district == last_district :
                    span[1] = end_date
                else :
                    merged_spans.append(span)
                    span = [start_date, end_date, district]

                last_end_date = end_date
                last_district = district

            merged_spans.append(span)

            for start_date, end_date, district in merged_spans :
                district = councilman['District'].replace(' 0', ' ')
                if end_date == datetime.date(2017, 12, 31) :
                    end_date = ''
                else :
                    end_date = end_date.isoformat()
                print(start_date, end_date)
                p.add_term('Council Member', 'legislature', 
                           district=district, 
                           start_date=start_date.isoformat(),
                           end_date=end_date)

            party = councilman['Political Party']
            if party == 'Democrat' :
                party = 'Democratic'
            
            if party :
                p.add_party(party)

            if councilman['Photo'] :
                p.image = councilman['Photo']

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.extras = {'Notes' : councilman['Notes']}
                 
            p.add_source(councilman['Person Name']['url'], note='web')

            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']
                if committee_name not in noncommittees and 'committee' in committee_name.lower():
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        parent_id = PARENT_ORGS.get(committee_name,
                                                    'New York City Council')
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name' : parent_id})
                        o.add_source(committee['Department Name']['url'])
                        committee_d[committee_name] = o

                    membership = o.add_member(p, role=committee["Title"])
                    membership.start_date = self.mdY2Ymd(committee["Start Date"])
            yield p
            

        for o in committee_d.values() :
            if 'Committee' in o.name :
                yield o

        for o in committee_d.values() :
            if 'Subcommittee' in o.name :
                yield o

        o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                         classification='committee',
                         parent_id={'name' : 'New York City Council'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o

        o = Organization('Subcommittee on Drug Abuse',
                         classification='committee',
                         parent_id={'name' : 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o
Example #11
    def scrape_chamber(self, chamber):
        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster/?body=' + body
        page = self.get(url).text

        # there is a bad comment closing tag on this page
        page = page.replace('--!>', '-->')

        root = html.fromstring(page)

        path = '//table//tr'
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            name, district, party, email, room, phone, = row.xpath('td')

            if email.attrib.get('class') == 'vacantmember':
                continue  # Skip any vacant members.

            link = name.xpath('string(a/@href)')
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()
            if '--' in name:
                name = name.split('--')[0].strip()

            linkpage = self.get(link).text
            linkpage = linkpage.replace('--!>', '-->')
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

            if len(photos) != 1:
                self.warning('no photo on ' + link)
                photo_url = ''
            else:
                photo_url = photos[0].attrib['src']

            district = district.text_content()
            party = party.text_content().strip()
            email = email.text_content().strip()

            if email.startswith('Email: '):
                email = email.replace('Email: ', '').lower() + '@azleg.gov'
            else:
                email = ''

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == 'lower':
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = address + "1700 West Washington\n Room " + room  \
                              + "\nPhoenix, AZ 85007"

            phone = phone.text_content().strip()
            if '602' not in re.findall(r'(\d+)', phone):
                phone = "602-" + phone

            leg = Person(primary_org=chamber,
                         image=photo_url,
                         name=name,
                         district=district,
                         party=party)
            leg.add_contact_detail(type='address',
                                   value=address,
                                   note='Capitol Office')
            leg.add_contact_detail(type='voice',
                                   value=phone,
                                   note='Capitol Office')
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type='email', value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)
                # leg.add_role(position, term, chamber=chamber,
                #             district=district, party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            yield leg
Example #12
    def scrape_chamber(self, chamber):
        body = {"lower": "H", "upper": "S"}[chamber]
        url = "http://www.azleg.gov/MemberRoster/?body=" + body
        page = self.get(url).text

        # there is a bad comment closing tag on this page
        page = page.replace("--!>", "-->")

        root = html.fromstring(page)

        path = "//table//tr"
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ""
            name, district, party, email, room, phone, = row.xpath("td")

            if email.attrib.get("class") == "vacantmember":
                continue  # Skip any vacant members.

            link = name.xpath("string(a/@href)")
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()
            if "--" in name:
                name = name.split("--")[0].strip()

            linkpage = self.get(link).text
            linkpage = linkpage.replace("--!>", "-->")
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

            if len(photos) != 1:
                self.warning("no photo on " + link)
                photo_url = ""
            else:
                photo_url = photos[0].attrib["src"]

            district = district.text_content().strip()
            party = party.text_content().strip()
            email = email.text_content().strip()

            if email.startswith("Email: "):
                email = email.replace("Email: ", "").lower() + "@azleg.gov"
            else:
                email = ""

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == "lower":
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = (address + "1700 West Washington\n Room " + room +
                       "\nPhoenix, AZ 85007")

            phone = phone.text_content().strip()
            if "602" not in re.findall(r"(\d+)", phone):
                phone = "602-" + phone

            leg = Person(
                primary_org=chamber,
                image=photo_url,
                name=name,
                district=district,
                party=party,
            )
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=phone,
                                   note="Capitol Office")
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type="email", value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)
                # leg.add_role(position, term, chamber=chamber,
                #             district=district, party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            yield leg
Example #13
def test_person_add_party():
    p = Person('Groot')
    p.add_party('Green')
    p._related[0].validate()
    assert get_pseudo_id(p._related[0].organization_id) == {
        'name': 'Green', 'classification': 'party'}