Пример #1
0
    def get_council(self):
        """Yield a Person for each member linked from the city directory table."""
        directory = self.lxmlize(self.COUNCIL_URL)

        profile_links = directory.xpath(
            '//table[@summary="City Directory"]/tr//a[contains(@href, "/directory.aspx?EID=")]/@href'
        )
        for profile_url in profile_links:
            profile = self.lxmlize(profile_url)

            # Exactly one name node is expected; strip any honorific prefix.
            (raw_name,) = profile.xpath('//span[@class="BioName"]/span/text()')
            (member_name,) = re.findall(r"^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$", raw_name)

            # The title text follows the "BioLink" anchor, e.g. "Title: Mayor".
            (raw_title,) = profile.xpath('//a[@class="BioLink"]/following-sibling::text()')
            (member_title,) = re.findall(r"^Title: (Council Member(?: Ward \d)|Mayor)\s*$", raw_title)

            # Not every member has a portrait; unpacking an empty result
            # raises ValueError, which we treat as "no image".
            try:
                (portrait,) = profile.xpath('//span[@class="BioText"]//img/@src')
            except ValueError:
                portrait = ""

            councilor = Person(name=member_name, image=portrait, primary_org="legislature", role=member_title)
            councilor.add_source(profile_url)

            yield councilor
Пример #2
0
    def scrape_legislator(self, chamber, name, url):
        """Scrape one legislator's detail page at *url* and yield a Person."""
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        # Heading reads like "... DISTRICT 07": take the second token and
        # drop any zero padding.
        heading = doc.xpath('//h1[contains(., "DISTRICT")]/text()').pop()
        district = heading.split()[1].strip().lstrip('0')

        # Party letter appears in parentheses inside the last <h2>.
        party_text = doc.xpath('//h2').pop().text_content()
        abbrev = re.search(r'\((R|D|I)[ \-\]]', party_text).group(1)
        party = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}[abbrev]

        photo_url = doc.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib['src']

        person = Person(name, district=district, party=party, image=photo_url, primary_org=chamber)
        person.add_link(url)
        person.add_source(url)
        self.scrape_offices(person, doc)

        yield person
Пример #3
0
    def get_council(self):
        """Yield a Person for every council member found in the city directory."""
        directory = self.lxmlize(self.COUNCIL_URL)

        profile_urls = directory.xpath(
            '//table[@summary="City Directory"]/tr//'
            'a[contains(@href, "/directory.aspx?EID=")]/@href')
        for profile_url in profile_urls:
            profile = self.lxmlize(profile_url)

            # Strip an optional honorific prefix from the displayed name.
            (raw_name, ) = profile.xpath('//h1[@class="BioName"]/text()')
            (member_name, ) = re.findall(r'^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$', raw_name)

            # The parent div holds a variable number of text nodes (an email
            # line may or may not be present), so collect them all and take
            # the second one, which carries the title.
            div_texts = profile.xpath('//a[@class="BioLink"]/parent::div/text()')
            raw_title = div_texts[1].strip()
            (member_title, ) = re.findall(
                r'^Title: (Council Member,?(?: Ward \d)|Mayor)\s*$', raw_title)

            # A missing portrait is tolerated: unpacking an empty xpath
            # result raises ValueError.
            try:
                (portrait, ) = profile.xpath('//span[@class="BioText"]//img/@src')
            except ValueError:
                portrait = ''

            member = Person(name=member_name,
                            image=portrait,
                            primary_org='legislature',
                            role=member_title)
            member.add_source(profile_url)
            yield member
Пример #4
0
    def scrape_upper_chamber(self, term):
        """Yield a Person for each senator on the oksenate.gov roster.

        ``term`` is accepted for interface compatibility but is not used here.
        """
        roster_url = "http://oksenate.gov/Senators/Default.aspx"
        doc = lxml.html.fromstring(self.get(roster_url).text)
        doc.make_links_absolute(roster_url)

        table = doc.xpath('//table[@summary]')[0]
        for anchor in table.xpath('.//td//a[contains(@href, "biographies")]'):
            # District number usually trails the anchor's parent element;
            # otherwise fall back to a sibling <span>.
            parent_tail = anchor.xpath('..')[0].tail
            if parent_tail:
                district = parent_tail.split()[1]
            else:
                district = anchor.xpath('../../span')[1].text.split()[1]

            # Skip seats with no name or an explicit "Vacant" marker.
            if anchor.text is None or anchor.text.strip() == 'Vacant':
                self.warning("District {} appears to be empty".format(district))
                continue

            # Link text looks like "Name (R)".
            match = re.match(r'(.+) \(([A-Z])\)', anchor.text.strip())
            name, party = match.group(1), self._parties[match.group(2)]

            bio_url = anchor.get('href')

            senator = Person(primary_org='upper',
                             district=district,
                             name=name.strip(),
                             party=party,
                             )
            senator.add_link(bio_url)
            senator.add_source(bio_url)
            self.scrape_upper_offices(senator, bio_url)
            yield senator
Пример #5
0
	def scrape_alderman(self, ward_num):
		"""Return a Person for the alderman of ward *ward_num*.

		Resolves the ward page to the alderman's profile page, then scrapes
		name, photo, and phone number from the profile.
		"""
		ward_url = "{}/ward-{}".format(Utils.ALDERMEN_HOME, ward_num)
		alderman_url = self.alderman_url(ward_url)
		alderman_page = self.lxmlize(alderman_url)

		# person's name is the only <h1> tag on the page
		name = alderman_page.xpath("//h1/text()")[0]

		# initialize person object with appropriate data so that pupa can 
		# automatically create a membership object linking this person to
		# a post in the jurisdiction's "Board of Aldermen" organization
		district = "Ward {} Alderman".format(ward_num)
		person = Person(name=name, district=district, role="Alderman", 
										primary_org="legislature")

		# set additional fields
		person.image = alderman_page.xpath("//div/img/@src")[0]
		# phone number is the text node that follows the "Phone:" label
		phone_number = alderman_page.xpath("//strong[text()='Phone:']/../text()")[1].strip()
		person.add_contact_detail(type="voice", value=phone_number)

		# add sources
		person.add_source(alderman_url, note="profile")
		person.add_source(ward_url, note="ward")

		return person
Пример #6
0
    def handle_list_item(self, item):
        """Build a Person from one row of the member list.

        Returns None (skips the row) for vacant, resigned, or pending seats.
        """
        anchor = item.xpath('.//div[@class="rep_style"]/a')[0]
        name = anchor.text_content().strip()

        # Placeholder rows carry no actual member.
        if any(marker in name for marker in ('Vacant', 'Resigned', 'Pending')):
            return

        party_code = item.xpath('.//div[@class="party_style"]/text()')[0].strip()
        party = {'D': 'Democratic', 'R': 'Republican'}[party_code]

        district = item.xpath('.//div[@class="district_style"]/text()')[0].strip()

        # The member's photo is keyed by the MemberId query parameter of the
        # detail-page URL.
        leg_url = anchor.get('href')
        member_id = parse.parse_qs(parse.urlsplit(leg_url).query)['MemberId'][0]
        image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(member_id)

        rep = Person(name=name, district=district, party=party, primary_org='lower',
                     role='Representative', image=image)
        rep.add_link(leg_url)
        rep.add_source(leg_url)
        rep.add_source(self.url)

        self.scrape_page(RepDetail, leg_url, obj=rep)

        return rep
Пример #7
0
    def scrape(self):
        """Yield the Temecula City Council organization, then its members."""
        urls = Urls(dict(list=legislators_url), self)

        council = Organization(
            'Temecula City Council',
            classification='legislature')
        council.add_source(urls.list.url)
        yield council

        # First row of the table is a header; skip it.
        for row in urls.list.xpath('//table[2]//tr')[1:]:

            # Name and role share the first cell's font nodes.
            name, role = row.xpath('td/p[1]//font/text()')
            image = row.xpath('td/img/@src').pop()

            person = Person(name, image=image)

            # Link the person to the council with the parsed role.
            membership = person.add_membership(council, role=role)

            # The cell holds two anchors: a mailto: link and a detail page.
            email, detail_url = row.xpath('td//a/@href')
            # Drop the leading "mailto:" scheme (7 characters).
            email = email[7:]
            membership.contact_details.append(
                dict(type='email', value=email, note='work'))

            person.add_source(urls.list.url)
            person.add_source(detail_url)

            yield person
    def bos_scrape_people(self):
        """Yield a Person for each councillor cell on the member-list page."""
        listing = self.lxmlize(MEMBER_LIST)
        cells = listing.xpath(
            "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

        for cell in cells:
            img_node = self.get_one(cell, ".//img")
            link_node = self.get_one(
                cell, ".//a[contains(@href, 'councillors') and (text()!='')]")
            role = cell.xpath(".//br")[0].tail.strip()

            # The listing photo is only a fallback; the member's homepage
            # may supply a better one below.
            image = img_node.attrib['src']
            homepage = link_node.attrib['href']
            name = clean_name(link_node.text)

            info = self.scrape_homepage(homepage)
            if info.get('image', None):
                image = info['image']

            councillor = Person(name=name, district=role, image=image,
                                primary_org="legislature", biography=info['bio'])
            councillor.add_link(url=homepage, note='homepage')
            councillor.add_source(homepage)
            councillor.add_source(MEMBER_LIST)
            yield councillor
Пример #9
0
    def scrape_chamber(self, session):
        """Yield Person objects for all legislators returned by the API for *session*."""
        session_key = SESSION_KEYS[session]
        legislators = self.api_client.get('legislators', session=session_key)

        for row in legislators:
            # Member photos are keyed by the trailing segment of the
            # member's website URL.
            url_name = row['WebSiteUrl'].split('/')[-1]
            chamber_name = 'house' if row['Chamber'] == 'H' else 'senate'
            img = 'https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg'.format(
                chamber_name, url_name
            )

            # The API reports "Democrat"; normalize to "Democratic".
            party = row['Party']
            if party == 'Democrat':
                party = 'Democratic'

            person = Person(name='{} {}'.format(row['FirstName'], row['LastName']),
                            primary_org={'S': 'upper', 'H': 'lower'}[row['Chamber']],
                            party=party,
                            district=row['DistrictNumber'],
                            image=img)
            person.add_link(row['WebSiteUrl'])
            person.add_source(row['WebSiteUrl'])

            # Address and phone are optional; email is always attached.
            if row['CapitolAddress']:
                person.add_contact_detail(type='address', value=row['CapitolAddress'],
                                          note='Capitol Office')
            if row['CapitolPhone']:
                person.add_contact_detail(type='voice', value=row['CapitolPhone'],
                                          note='Capitol Office')
            person.add_contact_detail(type='email', value=row['EmailAddress'],
                                      note='Capitol Office')

            yield person
Пример #10
0
    def scrape_member(self, chamber, member_url):
        """Scrape a single legislator's bio page at *member_url* into a Person.

        Extracts name, party, district, and photo from the page header, then
        the Frankfort (Capitol) address, Annex phone/fax numbers, and an
        @lrc.ky.gov email address from the contact sections.
        """
        member_page = self.get(member_url).text
        doc = lxml.html.fromstring(member_page)

        photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
        # The name span ends with a party tag such as "(R)"; the first token
        # is presumably a courtesy title (confirm against live pages), so the
        # display name is everything in between.
        name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
        full_name = ' '.join(name_pieces[1:-1]).strip()

        party = name_pieces[-1]
        if party == '(R)':
            party = 'Republican'
        elif party == '(D)':
            party = 'Democratic'
        elif party == '(I)':
            party = 'Independent'

        # The district header's last whitespace-separated token is the number.
        district = doc.xpath('//span[@id="districtHeader"]/text()')[0].split()[-1]

        person = Person(name=full_name, district=district, party=party,
                        primary_org=chamber, image=photo_url)
        person.add_source(member_url)
        person.add_link(member_url)

        address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                      'span[@class="bioText"]/text()'))

        phone = None
        fax = None
        # Only numbers labeled "Annex: " are used; a trailing " (fax)" marker
        # distinguishes the fax line from the voice line.
        phone_numbers = doc.xpath('//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()')
        for num in phone_numbers:
            if num.startswith('Annex: '):
                num = num.replace('Annex: ', '')
                if num.endswith(' (fax)'):
                    fax = num.replace(' (fax)', '')
                else:
                    phone = num

        emails = doc.xpath(
            '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'
        )
        # Keep the LAST address containing "@lrc.ky.gov", or None if absent.
        email = reduce(
            lambda match, address: address if '@lrc.ky.gov' in str(address) else match,
            [None] + emails
        )

        if phone:
            person.add_contact_detail(type='voice', value=phone, note='Capitol Office')

        if fax:
            person.add_contact_detail(type='fax', value=fax, note='Capitol Office')

        if email:
            person.add_contact_detail(type='email', value=email, note='Capitol Office')

        if address.strip() == "":
            self.warning("Missing Capitol Office!!")
        else:
            person.add_contact_detail(type='address', value=address, note='Capitol Office')

        yield person
Пример #11
0
def test_full_person():
    """Round-trip a fully-populated scraped person through the importer."""
    person = ScrapePerson('Tom Sawyer')
    person.add_identifier('1')
    person.add_name('Tommy', start_date='1880')
    person.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    person.add_link('http://example.com/link')
    person.add_source('http://example.com/source')

    # import person
    PersonImporter('jurisdiction-id').import_data([person.as_dict()])

    # get person from db and assert it imported correctly
    p = Person.objects.get()
    assert 'ocd-person' in p.id
    assert p.name == person.name

    identifier = p.identifiers.all()[0]
    assert identifier.identifier == '1'
    assert identifier.scheme == ''

    other_name = p.other_names.all()[0]
    assert other_name.name == 'Tommy'
    assert other_name.start_date == '1880'

    contact = p.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert p.links.all()[0].url == 'http://example.com/link'
    assert p.sources.all()[0].url == 'http://example.com/source'
 def scrape_csv(self, reader):
     """Yield a Person for each row of a CSV *reader*.

     Each row is expected to provide 'Contact First Name' and
     'Contact Last Name' keys (used via str.format(**row)).
     """
     for row in reader:
         contributor = Person(
             name="{Contact First Name} {Contact Last Name}".format(**row)
         )
         contributor.add_source(SEARCH_URL)
         yield contributor
Пример #13
0
    def scrape_member(self, chamber, link):
        """Yield a Person built from one roster-table *link* for a legislator."""
        name = link.text.strip()
        leg_url = link.get('href')
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")

        # we get email on the next page now
        # email = link.xpath("string(../../td[5])")

        # Normalize party labels to the expected vocabulary.
        if party == 'Democrat':
            party = 'Democratic'
        elif party == 'No Party Specified':
            party = 'Independent'

        # The photo service is keyed by general-assembly number + person id.
        pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))

        member = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url)

        member.add_link(leg_url)
        member.add_source(leg_url)

        # Continue scraping contact details from the member's own page.
        detail_doc = lxml.html.fromstring(self.get(link.attrib['href']).text)
        self.scrape_member_page(member, detail_doc)
        yield member
Пример #14
0
    def scrape_counciler(self, url):
        """Scrape one councilor's page and yield a Person with contact details."""
        page = self.lxmlize(url)
        (name,) = page.xpath("//h3[@class='subtitle']/text()")
        (district,) = page.xpath("//div[@class='right-bar']//h2/text()")
        (portrait,) = page.xpath(
            "//div[@class='left-bar']//a[@class='image lightbox']//img"
        )

        member = Person(
            primary_org='legislature',
            name=name, district=district,
            image=portrait.attrib['src']
        )
        member.add_source(url)

        # Contact-table cells read "Label: value"; map each label to a
        # pupa contact-detail type.
        contact_types = {
            "Home Phone": "voice",
            "Address": "address",
            "Email": "email",
            "Cell Phone": "voice",
        }
        for cell in page.xpath("//table[@align='center']//td"):
            text = cell.text_content().strip()
            if not text:
                continue

            label, value = text.split(":", 1)
            member.add_contact_detail(type=contact_types[label],
                                      note=label,
                                      value=value)

        yield member
Пример #15
0
    def handle_list_item(self, item):
        """Build a Person from one member card in the roster list.

        Parses name/district/party from the card heading ("Name (07B, XYZ)"),
        the capitol address and phone from the card's text nodes, and the
        email from the second mailto link.
        """
        photo_url = item.xpath('./img/@src')[0]
        url = item.xpath('.//h5/a/@href')[0]
        name_text = item.xpath('.//h5/a/b/text()')[0]

        # Heading format: "Name (07B, PARTY)".
        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [x.strip() for x in item.xpath(
            './div/text()[normalize-space()]'
        ) if x.strip()]
        address = '\n'.join((info_texts[0], info_texts[1]))

        # BUG FIX: phone/email were previously left unbound (NameError below)
        # whenever validation failed. Default to None and only attach the
        # contact detail when a valid value is present.
        phone = None
        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email = None
        email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
        if validate_email_address(email_text):
            email = email_text

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address, note='capitol')
        if phone:
            rep.add_contact_detail(type='voice', value=phone, note='capitol')
        if email:
            rep.add_contact_detail(type='email', value=email, note='capitol')
        rep.add_source(self.url)

        yield rep
Пример #16
0
    def scrape_chamber(self, chamber, session):
        """Scrape Nevada legislators for one chamber and session.

        Fetches the chamber roster JSON, then each member's detail JSON, and
        yields Person objects with district-office contact details attached.
        """
        if chamber == 'upper':
            chamber_slug = 'Senate'
        elif chamber == 'lower':
            chamber_slug = 'Assembly'
        session_slug = self.jurisdiction.session_slugs[session]

        leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug,
                                                                               session_slug)
        leg_json_url = ('http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' %
                        (session_slug, chamber_slug))

        resp = json.loads(self.get(leg_json_url).text)
        for item in resp:
            # Skip placeholder entries for empty/vacant districts.
            empty_names = ['District No', 'Vacant']
            if any(name in item['FullName'] for name in empty_names):
                continue

            # The API gives "Last, First"; reorder to "First Last".
            last, first = item['FullName'].split(",", 1)
            item['FullName'] = "{first} {last}".format(last=last,
                                                       first=first).strip()
            person = Person(name=item['FullName'], district=item['DistrictNbr'],
                            party=item['Party'], primary_org=chamber,
                            image=item['PhotoURL'])
            leg_url = leg_base_url + item['DistrictNbr']

            # Hack: the legislator ID only appears inside a JS call on the
            # detail page, so scan the page HTML for it. Previously a page
            # without the marker left leg_id unbound (NameError); fail with
            # a clear error instead.
            leg_id = None
            html = self.get(leg_url).text
            for line in html.split('\n'):
                if 'GetLegislatorDetails' in line:
                    leg_id = line.split(',')[1].split("'")[1]
            if leg_id is None:
                raise ValueError('could not find legislator ID on ' + leg_url)

            # Fetch the JSON the detail page itself uses.
            leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id='
                               .format(session_slug) + leg_id)
            leg_resp = json.loads(self.get(leg_details_url).text)
            details = leg_resp['legislatorDetails']

            address = details['Address1']
            address2 = details['Address2']
            if address2:
                address += ' ' + address2
            address += '\n%s, NV %s' % (details['City'], details['Zip'])

            phone = details['LCBPhone']
            email = details['LCBEmail']
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            # BUG FIX: this guard previously tested `phone` instead of
            # `email`, so members with a phone but no email got an empty
            # email record and members with email but no phone lost theirs.
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            person.add_link(leg_details_url)
            person.add_source(leg_details_url)
            yield person
Пример #17
0
    def scrape_chamber(self, chamber):
        """Scrape Alaska legislators for one chamber from its roster page.

        Deceased members are skipped; a missing email link produces a
        warning but the member is still yielded.
        """
        # Site party labels -> normalized party names.
        self._party_map = {
            'Democrat': 'Democratic',
            'Republican': 'Republican',
            'Non Affiliated': 'Independent',
            'Not Affiliated': 'Independent',
        }

        if chamber == 'upper':
            url = 'http://senate.legis.state.ak.us/'
        else:
            url = 'http://house.legis.state.ak.us/'

        page = self.lxmlize(url)

        # The second <ul class="item"> holds the member cards.
        items = page.xpath('//ul[@class="item"]')[1].getchildren()

        for item in items:
            photo_url = item.xpath('.//img/@src')[0]
            name = item.xpath('.//strong/text()')[0]
            leg_url = item.xpath('.//a/@href')[0]
            email = item.xpath('.//a[text()="Email Me"]/@href')
            if email:
                email = email[0].replace('mailto:', '')
            else:
                # NOTE(review): when no link is found, `email` stays an empty
                # list and is passed through to _scrape_offices as-is.
                self.warning('no email for ' + name)

            party = district = None
            skip = False

            # Party and district live in <dt>/<dd> pairs; a "Deceased" label
            # means the member is skipped entirely.
            for dt in item.xpath('.//dt'):
                dd = dt.xpath('following-sibling::dd')[0].text_content()
                label = dt.text.strip()
                if label == 'Party:':
                    party = dd
                elif label == 'District:':
                    district = dd
                elif label.startswith('Deceased'):
                    skip = True
                    self.warning('skipping deceased ' + name)
                    break

            if skip:
                continue

            person = Person(
                primary_org=chamber,
                district=district,
                name=name,
                party=self._party_map[party],
                image=photo_url,
            )
            person.add_source(leg_url)
            person.add_link(leg_url)

            # scrape offices
            self._scrape_offices(person, leg_url, email)

            yield person
Пример #18
0
    def scrape_chamber(self, chamber):
        """
        Scrapes legislators for the current term only.

        Builds one Person per roster row with district and Capitol contact
        details; rows marked "Resigned" or "Substitute" are skipped.
        """
        # self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        index = self.get(url).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
            inner_text = inner.text_content()
            if 'Resigned' in inner_text or 'Substitute' in inner_text:
                continue

            # Collapse non-breaking spaces and runs of whitespace in the name.
            name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')

            person_url = inner.xpath('p/a/@href')[0]
            # skip roles for now
            role = ''
            # for com in inner.xpath('p/a[contains(@href, "committees")]'):
            #     role = com.tail.strip()

            person = Person(name=name, district=district,
                            party=party, primary_org=chamber,
                            image=img_url, role=role)
            # Prefer the home phone for the district office, falling back to
            # the business line; the "office" phone is filed under Capitol.
            phones = get_phones(inner)
            phone = phones.get('home') or phones.get('business')
            office_phone = phones.get('office')
            address = get_address(inner)
            fax = get_fax(inner)
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            if fax:
                person.add_contact_detail(type='fax', value=fax,
                                          note='District Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            if office_phone:
                person.add_contact_detail(type='voice', value=office_phone,
                                          note='Capitol Office')
            person.add_source(url)
            person.add_link(person_url)
            yield person
Пример #19
0
def test_basic_invalid_person():
    """Validation passes with a name set and fails once it is removed."""
    person = Person("Bob B. Johnson")
    person.add_source(url='http://example.com')
    person.validate()  # valid while the name is present

    person.name = None

    # A missing name must be rejected.
    with pytest.raises(ScrapeValueError):
        person.validate()
Пример #20
0
def test_basic_invalid_person():
    """Validation passes with a name set and fails once it is removed."""
    person = Person("Bob B. Johnson")
    person.add_source(url='foo')
    person.validate()  # valid while the name is present

    person.name = None

    # A missing name must be rejected.
    with pytest.raises(ValidationError):
        person.validate()
Пример #21
0
    def scrape_lower(self, chamber):
        """Scrape House members from the public representative list."""
        url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
        columns = [
            "website",
            "district",
            "name",
            "party",
            "location",
            "phone",
            "email"
        ]

        doc = lxml.html.fromstring(self.get(url).text)

        # skip two rows at top
        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            tds = row.xpath('.//td')
            if len(tds) == 0:
                continue
            # Label each cell by its column position.
            cells = {col: tds[i] for i, col in enumerate(columns)}

            district = str(int(cells['district'].text_content().strip()))
            party = cells['party'].text_content().strip()
            phone = cells['phone'].text_content().strip()
            email = cells['email'].text_content().strip()
            leg_url = cells['website'].xpath("./a")[0].attrib['href']
            name = cells['name'].text_content().strip()
            if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
                self.warning('District {} appears vacant, and will be skipped'.format(district))
                continue

            # Expand office-building abbreviations into full mailing addresses.
            office = cells['location'].text_content().strip()
            office = re.sub(
                ' HOB',
                ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
                office
            )
            office = re.sub(
                ' CB',
                ' State Capitol Building\nLansing, MI 48909',
                office
            )

            photo_url = self.get_photo_url(leg_url)
            person = Person(name=name, district=district, party=abbr[party],
                            primary_org='lower', image=photo_url[0] if photo_url else None)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address', value=office, note='Capitol Office')
            person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            person.add_contact_detail(type='email', value=email, note='Capitol Office')

            yield person
Пример #22
0
    def scrape_member_page(self, chamber, url):
        """Scrape a member-listing page, yielding one Person per member module.

        Email addresses are synthesized from the district number, since they
        follow a fixed per-chamber pattern.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' memberModule ')]"
                ):
            img = legislator.xpath(
                ".//div[@class='thumbnail']//img")[0].attrib['src']
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()

            if "Vacant" in full_name:
                continue

            homepage = homepage.attrib['href']
            party = data.xpath(
                ".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
            # Free text nodes carry the office address; the last one is the
            # phone number, which we pop off before joining the rest.
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)
            office = "\n".join(office_lines)
            # District number: from the <h3> heading when present, otherwise
            # parsed out of the module's background-image filename (NN.png).
            h3 = data.xpath("./h3")
            if len(h3):
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District", ""
                                                            ).strip()
            else:
                district = re.findall(
                    r"\d+\.png",
                    legislator.attrib['style']
                )[-1].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", full_name).strip()
            # repNN@ohiohouse.gov / sdNN@ohiosenate.gov, zero-padded to two
            # digits.
            email = (
                'rep{0:0{width}}@ohiohouse.gov'
                if chamber == 'lower' else
                'sd{0:0{width}}@ohiosenate.gov'
            ).format(int(district), width=2)

            leg = Person(name=full_name, district=district,
                         party=party, primary_org=chamber,
                         image=img)

            leg.add_contact_detail(type='address', value=office, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
Пример #23
0
def test_person_add_membership():
    """Adding a membership creates exactly one validated related object
    linking the person to the organization with the given start date."""
    person = Person('Bob B. Bear')
    person.add_source('http://example.com')
    org = Organization('test org', classification='unknown')
    person.add_membership(org, role='member', start_date='2007')

    related = person._related
    assert len(related) == 1

    membership = related[0]
    membership.validate()
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
Пример #24
0
    def _scrape_legislator(self, row, chamber):
        """Build a Person from one roster-table row plus their detail page.

        :param row: lxml ``<tr>`` element from the chamber roster table.
        :param chamber: chamber key, looked up in ``self._chamber_map``.

        Yields a single Person with Capitol Office contact details.
        """
        name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
        # The name cell's text wraps across lines; rejoin the non-empty parts.
        name = ' '.join([line.strip() for line
                         in name_cell.text_content().split('\n')
                         if len(line.strip()) > 0])

        party_letter = row.xpath(
            './td[@class="rosterCell partyCell"]/text()')[0].strip()
        party = dict(D='Democratic', R='Republican')[party_letter]

        # Seat text carries a chamber abbreviation; strip it so only the
        # district identifier remains.
        chamber_abbr = self._chamber_map[chamber]
        district = row.xpath('./td[@class="rosterCell seatCell"]'
                             '/text()')[0].replace(chamber_abbr, '').strip()
        try:
            email = row.xpath('./td[@class="rosterCell emailCell"]'
                              '/a/@href')[0].replace('mailto:', '').strip()
        except IndexError:
            # No mailto link present for this member.
            email = None

        phone = row.xpath('./td[@class="rosterCell phoneCell"]'
                          '/text()')[0].strip() or None

        # The mailing address only appears on the member's detail page.
        details_url = 'https://leg.mt.gov{}'.format(name_cell.attrib['href'])
        response = self.get(details_url)
        details_page = lxml.html.fromstring(response.text)

        # Drop the "Address" label and rejoin the remaining non-blank lines.
        address_lines = details_page.xpath(
            '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
            '/p[contains(text(), "Address")]'
            )[0].text_content() \
                .replace('Address', '') \
                .split('\n')
        address = '\n'.join([line.strip() for line in address_lines
                             if len(line.strip()) > 0])

        legislator = Person(name=name,
                            district=district,
                            party=party,
                            primary_org=chamber)

        legislator.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
        if phone is not None:
            legislator.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')

        if email is not None:
            legislator.add_contact_detail(type='email', value=email,
                                          note='E-mail')

        legislator.add_link(details_url)
        legislator.add_source(self._roster_url)

        yield legislator
Пример #25
0
    def scrape_lower_legislator(self, url, leg_info):
        """Build a Person from a lower-chamber member detail page.

        Skips placeholder pages for vacant seats. Contact details come
        partly from the page and partly from the roster row (``leg_info``).
        """
        page = self.lxmlize(url)

        full_name = page.xpath(
            '//span[@id="body_FormView5_FULLNAMELabel"]/text()'
            )[0].strip()
        # Vacant seats show a placeholder instead of a member name.
        if full_name.startswith("District ") or full_name.startswith("Vacant "):
            self.warning("Seat is vacant: {}".format(full_name))
            return

        image_url = page.xpath(
            '//img[contains(@src, "/h_reps/RepPics")]'
            )[0].attrib['src']

        # Map the site's party label onto the canonical spelling.
        raw_party = page.xpath(
            '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
            )[0].strip()
        party = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent",
        }[raw_party]

        try:
            email = page.xpath(
                '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
                )[0].strip()
        except IndexError:
            # No public email published for this member.
            email = None

        district = leg_info['dist'].replace('Dist', '').strip()

        person = Person(name=full_name,
                        party=party,
                        district=district,
                        primary_org='lower',
                        image=image_url)

        # Only record contact details that are actually present.
        for detail_type, value in (("address", leg_info["office"]),
                                   ("voice", leg_info["phone"]),
                                   ("email", email)):
            if value:
                person.add_contact_detail(type=detail_type,
                                          value=value,
                                          note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
Пример #26
0
    def scrape_chamber(self, chamber=None):
        """Yield a Person for each legislator found on the chamber listing.

        Relies on ``scrape_leg_page()`` to return per-legislator dicts with
        name/district/party/chamber/contact/committee keys.
        """
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower",
                           "Senate": "upper"}[leg['chamber']]
            except KeyError:
                # Deliberate workaround: report known-bad legislator pages
                # and skip them instead of aborting the whole scrape.
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            person = Person(name=leg['name'], district=leg['district'],
                            party=leg['party'], primary_org=chamber,
                            image=leg['image'])

            for source in leg['source']:
                person.add_source(source)

            try:
                for ctty in leg['ctty']:
                    # "Joint Legislative ..." committees belong to neither
                    # chamber exclusively.
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    # NOTE(review): this Organization is built and the member
                    # added, but it is never yielded or given a source here —
                    # confirm committees are emitted by another scraper.
                    comm = Organization(name=ctty['name'], classification="committee",
                                        chamber=ctty_chamber)
                    comm.add_member(person, role="member")

            except KeyError:
                self.warn("%s has no scraped Committees" % leg['name'])

            person.add_link(leg['homepage'])

            # Contact fields are optional; only add the ones present.
            if leg['addr']:
                person.add_contact_detail(type='address', value=leg['addr'], note='Capitol Office')
            if leg['phone']:
                person.add_contact_detail(type='voice', value=leg['phone'], note='Capitol Office')
            if leg['email']:
                person.add_contact_detail(type='email', value=leg['email'], note='Capitol Office')
            if leg['fax']:
                person.add_contact_detail(type='fax', value=leg['fax'], note='Capitol Office')
            yield person
Пример #27
0
def test_save_object_basics():
    """save_object writes the person to a JSON file named after its id."""
    scraper = Scraper('jurisdiction', '/tmp/')
    person = Person('Michael Jordan')
    person.add_source('http://example.com')

    # Patch json.dump so nothing is actually serialized to disk contents.
    with mock.patch('json.dump') as json_dump:
        scraper.save_object(person)

    # The output filename must be derived from the object's id.
    expected_name = 'person_' + person._id + '.json'
    assert expected_name in scraper.output_names['person']
    json_dump.assert_called_once_with(person.as_dict(), mock.ANY, cls=mock.ANY)
Пример #28
0
def test_save_related():
    """Saving an object also saves everything appended to its _related list."""
    scraper = Scraper('jurisdiction', '/tmp/')
    person = Person('Michael Jordan')
    person.add_source('http://example.com')
    org = Organization('Chicago Bulls')
    org.add_source('http://example.com')
    person._related.append(org)

    with mock.patch('json.dump') as json_dump:
        scraper.save_object(person)

    # Both the person and its related organization must be dumped, in order.
    expected_calls = [mock.call(person.as_dict(), mock.ANY, cls=mock.ANY),
                      mock.call(org.as_dict(), mock.ANY, cls=mock.ANY)]
    assert json_dump.mock_calls == expected_calls
Пример #29
0
    def get_member(self, session, chamber, kpid):
        """Yield a Person for one Kansas legislator from the ksapi member feed.

        :param session: biennium string, e.g. ``'2017-2018'`` — must be a key
            of the slug map below.
        :param chamber: 'upper' or 'lower'.
        :param kpid: member identifier used in both the API and website URLs.
        """
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        # Website path slug per biennium; a KeyError here means a new
        # session needs to be added to this map.
        slug = {'2013-2014': 'b2013_14',
                '2015-2016': 'b2015_16',
                '2017-2018': 'b2017_18',
                '2019-2020': 'b2019_20',
                }[session]
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            # NOTE(review): only HTTPError is caught — a page without exactly
            # one profile picture would raise ValueError here; confirm intended.
            photo_url, = legislator_page.xpath(
                '//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            # Bio page missing: fall back to no link and no photo.
            self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
            leg_url = ''
            photo_url = ''

        person = Person(
            name=content['FULLNAME'],
            district=str(content['DISTRICT']),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {'occupation': content['OCCUPATION']}

        # All members share the same Capitol building address, varying
        # only by room number.
        address = '\n'.join([
            'Room {}'.format(content['OFFICENUM']),
            'Kansas State Capitol Building',
            '300 SW 10th St.',
            'Topeka, KS 66612',
        ])

        note = 'Capitol Office'
        person.add_contact_detail(type='address', value=address, note=note)
        person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
        if content['OFFPH']:
            person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

        person.add_source(url)
        person.add_link(leg_url)

        yield person
Пример #30
0
    def scrape_senator(self, district):
        """Scrape one Maine state senator's district page and yield a Person.

        :param district: numeric district identifier used to build the URL.

        Parses the "<label>: <value>" paragraphs on the page for contact
        details; the first paragraph containing an image is the portrait.
        """
        link = "https://legislature.maine.gov/District-{}".format(district)
        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
        title = main.xpath('h1')[0].text
        # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
        title_match = re.match(
            r'District (\d+) - State Senator ([^\(]+) \(([DRI])', title)
        _, name, party = title_match.groups()
        name = re.sub(r'\s+', ' ', name.strip())
        party = _party_map[party]

        image_url = address = phone = email = None

        for p in main.xpath('p'):
            if p.xpath('.//img') and not image_url:
                image_url = p.xpath('.//img/@src')[0]
                continue
            field, _, value = p.text_content().partition(":")
            value = value.strip()
            if field in ('Address', 'Mailing Address'):
                address = value
            elif field in ('Phone', 'Home Phone'):
                phone = value
            elif field == 'Email':
                email = value

        person = Person(
            name=name,
            district=district,
            image=image_url,
            primary_org='upper',
            party=party,
        )

        person.add_link(link)
        person.add_source(link)

        if address:
            person.add_contact_detail(type='address', value=address, note='District Office')

        if phone:
            person.add_contact_detail(
                type='voice', value=clean_phone(phone), note='District Phone')

        # Guard like the other contact details: some pages omit the Email
        # field, and add_contact_detail must not be passed a None value.
        if email:
            person.add_contact_detail(type='email', value=email, note='District Email')

        yield person
Пример #31
0
    def scrape_senators(self):
        """Yield a Person per Maine senator from the published xlsx roster.

        Vitals and contact details come from the spreadsheet; each member's
        photo is scraped from their page linked off the HTML roster.
        """
        # Column index for each field in the spreadsheet.
        mapping = {
            'district': 0,
            'first_name': 2,
            'middle_name': 3,
            'last_name': 4,
            'suffixes': 5,
            'party': 1,
            'street_addr': 6,
            'city': 7,
            'state': 8,
            'zip_code': 9,
            'phone1': 10,
            'phone2': 11,
            'email': 12
        }

        url = ('https://mainelegislature.org/uploads/visual_edit/'
               '128th-senate-members-for-distribution-1.xlsx')
        fn, result = self.urlretrieve(url)

        wb = xlrd.open_workbook(fn)
        sh = wb.sheet_by_index(0)

        LEGISLATOR_ROSTER_URL = \
            'https://mainelegislature.org/senate/128th-senators/9332'
        roster_doc = lxml.html.fromstring(self.get(LEGISLATOR_ROSTER_URL).text)
        roster_doc.make_links_absolute(LEGISLATOR_ROSTER_URL)

        # Row 0 is the header; iterate the data rows.
        for rownum in range(1, sh.nrows):
            # get fields out of mapping
            d = {}
            for field, col_num in mapping.items():
                try:
                    d[field] = str(sh.cell(rownum, col_num).value).strip()
                except IndexError:
                    # This col_num doesn't exist in the sheet.
                    pass
            first_name = d['first_name']
            middle_name = d['middle_name']
            last_name = d['last_name']

            # Collapse doubled spaces left by a blank middle name.
            full_name = " ".join((first_name, middle_name, last_name))
            full_name = re.sub(r'\s+', ' ', full_name).strip()

            address = "{street_addr}\n{city}, ME {zip_code}".format(**d)

            # Prefer phone1, fall back to phone2, else no phone at all.
            phone = d['phone1']
            if not phone:
                phone = d['phone2']
            if not phone:
                phone = None

            # xlrd reads numeric cells as floats ("12.0"); drop the decimals.
            district = d['district'].split('.')[0]
            party = d['party'].split('.')[0]

            # Determine legislator's URL to get their photo
            URL_XPATH = '//li/a[contains(text(), "District {:02d}")]/@href'.format(
                int(district))

            try:
                (leg_url, ) = roster_doc.xpath(URL_XPATH)
            except ValueError:
                self.warning('vacant seat %s', district)
                continue  # Seat is vacant

            html = self.get(leg_url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            xpath = '//img[contains(@src, ".png")]/@src'
            photo_url = doc.xpath(xpath)
            if photo_url:
                photo_url = photo_url.pop()
            else:
                photo_url = None

            person = Person(
                name=full_name,
                district=district,
                image=photo_url,
                primary_org='upper',
                party=party,
            )

            person.add_link(leg_url)
            person.add_source(leg_url)
            person.extras['first_name'] = first_name
            person.extras['middle_name'] = middle_name
            person.extras['last_name'] = last_name

            person.add_contact_detail(type='address',
                                      value=address,
                                      note='District Office')
            if phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(phone),
                                          note='District Phone')
            person.add_contact_detail(type='email',
                                      value=d['email'],
                                      note='District Email')

            yield person
Пример #32
0
    def scrape_chamber(self, chamber):
        """Scrape one South Carolina chamber's member list.

        Yields committee Organizations (each once, on first sight) and a
        Person per member, with Capitol/District contact details and
        committee memberships.
        """
        if chamber == "lower":
            url = "http://www.scstatehouse.gov/member.php?chamber=H"
        else:
            url = "http://www.scstatehouse.gov/member.php?chamber=S"

        # Committee name -> Organization, so each is yielded only once.
        seen_committees = {}

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[@class="membername"]'):
            full_name = a.text
            leg_url = a.get("href")

            # Drop honorific prefixes from the listing text.
            if full_name.startswith("Senator"):
                full_name = full_name.replace("Senator ", "")
            if full_name.startswith("Representative"):
                full_name = full_name.replace("Representative ", "")

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if "Resigned effective" in leg_html:
                self.info("Resigned")
                continue

            # The bio block is only identifiable by its inline style.
            party, district, _ = leg_doc.xpath(
                '//p[@style="font-size: 17px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')

            if "Republican" in party:
                party = "Republican"
            elif "Democrat" in party:
                party = "Democratic"

            # District # - County - Map
            district = district.split()[1]
            try:
                photo_url = leg_doc.xpath(
                    '//img[contains(@src,"/members/")]/@src')[0]
            except IndexError:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ""
            person = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=photo_url,
            )

            # office address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left; width: 225px;'
                    ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
                capitol_address = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                capitol_phone = phone.strip()

                if capitol_address:
                    person.add_contact_detail(type="address",
                                              value=capitol_address,
                                              note="Capitol Office")

                if capitol_phone:
                    person.add_contact_detail(type="voice",
                                              value=capitol_phone,
                                              note="Capitol Office")
            except IndexError:
                # Block absent on the page; contact info simply omitted.
                self.warning("no capitol address for {0}".format(full_name))

            # home address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left;'
                    ' width: 225px; margin: 10px 0 0 20px;"]')[0]
                addr = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                phone = phone.strip()
                if addr:
                    person.add_contact_detail(type="address",
                                              value=addr,
                                              note="District Office")

                if phone:
                    person.add_contact_detail(type="voice",
                                              value=phone,
                                              note="District Office")
            except IndexError:
                self.warning("no district address for {0}".format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            for com in leg_doc.xpath(
                    '//a[contains(@href, "committee.php")]')[1:]:
                # A trailing ", " on the link text means a role suffix
                # follows the committee name (e.g. "..., V.C.").
                if com.text.endswith(", "):
                    committee, role = com.text_content().rsplit(", ", 1)

                    # known roles
                    role = {
                        "Treas.": "treasurer",
                        "Secy.": "secretary",
                        "Secy./Treas.": "secretary/treasurer",
                        "V.C.": "vice-chair",
                        "1st V.C.": "first vice-chair",
                        "Co 1st V.C.": "co-first vice-chair",
                        "2nd V.C.": "second vice-chair",
                        "3rd V.C.": "third vice-chair",
                        "Ex.Officio Member": "ex-officio member",
                        "Chairman": "chairman",
                    }[role]
                else:
                    committee = com.text
                    role = "member"

                # only yield each committee once
                if committee not in seen_committees:
                    com = Organization(name=committee,
                                       classification="committee",
                                       chamber=chamber)
                    com.add_source(url)
                    seen_committees[committee] = com
                    yield com
                else:
                    com = seen_committees[committee]

                person.add_membership(com, role=role)

            yield person
Пример #33
0
    def scrape_chamber(self, chamber, session):
        """Yield a Person per Wyoming legislator from the LSO JSON API.

        :param chamber: 'upper' or 'lower'.
        :param session: session identifier used to build member page URLs.
        """
        chamber_abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

        url = "https://wyoleg.gov/LsoService/api/legislator/2018/{}".format(
            chamber_abbrev)

        response = self.get(url)
        people_json = json.loads(response.content.decode('utf-8'))

        for row in people_json:

            # some fields are only available in the list json, some only in the details call
            details_url = 'https://wyoleg.gov/LsoService/api/legislator/{}'.format(
                row['legID'])
            details_response = self.get(details_url)
            details = json.loads(details_response.content.decode('utf-8'))

            party = self.party_map[row['party']]

            # Re-encode the API's US-style timestamp as ISO 8601 date.
            dob = datetime.datetime.strptime(details['dob'],
                                             '%m/%d/%Y %I:%M:%S %p')

            dob_str = datetime.datetime.strftime(dob, "%Y-%m-%d")

            photo_url = 'http://wyoleg.gov/LegislatorSummary/Photos/{}'.format(
                details['legPhoto'])

            person = Person(
                name=row['name'],
                # District codes look like "S01"/"H05"; strip the chamber
                # letter and leading zeros.
                district=row['district'].lstrip('SH0'),
                party=party,
                primary_org=chamber,
                birth_date=dob_str,
                image=photo_url,
            )

            if details['address']:
                address = '{}, {} {} {}'.format(details['address'],
                                                details['city'],
                                                details['state'],
                                                details['zip'])
                person.add_contact_detail(type='address', value=address)

            if row['eMail']:
                person.add_contact_detail(type='email', value=row['eMail'])

            if row['phone']:
                person.add_contact_detail(type='voice', value=row['phone'])

            # Preserve API-only biographical fields for downstream use.
            person.extras['wy_leg_id'] = row['legID']
            person.extras['county'] = row['county']
            person.extras['given_name'] = row['firstName']
            person.extras['family_name'] = row['lastName']
            person.extras['religion'] = details['religion']
            person.extras['number_children'] = details['noChildren']
            person.extras['spouse_given_name'] = details['spouseName']
            person.extras['place_of_birth'] = details['birthPlace']
            person.extras['occupation'] = details['occupationDesc']

            if details['legEducation']:
                person.extras['education'] = details['legEducation']

            if details['civicOrgs']:
                person.extras['civic_organizations'] = details['civicOrgs']

            # http://wyoleg.gov/Legislators/2018/S/2032
            leg_url = 'http://wyoleg.gov/Legislators/{}/{}/{}'.format(
                session, row['party'], row['legID'])

            person.add_source(leg_url)
            person.add_link(leg_url)

            yield person
Пример #34
0
    def scrape_table(self, chamber, tbl):
        """Yield a Person for each legislator row in a chamber roster table.

        :param chamber: 'upper' or 'lower'; passed through as primary_org.
        :param tbl: lxml element for the roster table.
        """
        # skip the header row
        for row in tbl.xpath('tr')[1:]:
            leg_a, district, _, _ = row.xpath('td')
            district = district.text
            name = leg_a.text_content().strip()
            if name.lower() == "to be announced":
                continue
            leg_url = leg_a.xpath('a/@href')[0]

            # get details from the member's page
            html = self.get(leg_url).text
            ldoc = lxml.html.fromstring(html)
            ldoc.make_links_absolute(leg_url)

            party = _get_table_item(ldoc, 'Party Affiliation:').text
            if party == 'Democrat':
                party = 'Democratic'

            # The Annapolis address block mixes street lines with an
            # optional "Phone: ..." line; split them apart.
            addr_lines = _get_table_item(ldoc,
                                         'Annapolis Address:').xpath('text()')
            address = []
            # Initialize so a page without a phone line cannot leave this
            # unbound (previously a NameError, not caught by the except).
            phone = None
            for line in addr_lines:
                if 'Phone:' not in line:
                    address.append(line)
                else:
                    phone = line
            address = '\n'.join(address)
            if phone:
                try:
                    phone = re.findall(r'Phone: (\d{3}-\d{3}-\d{4})', phone)[0]
                except IndexError:
                    self.warning("Missing phone!")
                    phone = None
            else:
                self.warning("Missing phone!")

            email = ldoc.xpath('//a[contains(@href, "mailto:")]/text()')
            if not email:
                email = None
            elif len(email) == 1:
                email = email[0].strip()
            else:
                raise AssertionError('Multiple email links found on page')

            # Portrait is optional; default to empty rather than leaving
            # photo_url unbound on the first row (or stale from a prior row).
            img_src = ldoc.xpath('//img[@class="sponimg"]/@src')
            photo_url = img_src[0] if img_src else ''

            leg = Person(primary_org=chamber,
                         district=district,
                         name=name,
                         party=party,
                         image=photo_url)
            leg.add_source(url=leg_url)
            leg.add_link(url=leg_url)

            # type ['address', 'email', 'url', 'fax', 'text', 'voice', 'video', 'pager', 'textphone']
            if address:
                leg.add_contact_detail(type='address',
                                       value=address or None,
                                       note='Capitol Office')
            if phone:
                leg.add_contact_detail(type='voice',
                                       value=phone,
                                       note='Capitol Office')

            if email:
                leg.add_contact_detail(type='email',
                                       value=email,
                                       note='Capitol Office')

            yield leg
Пример #35
0
    def _parse_person(self, row, chamber, seat_map):
        """Translate one roster row dict into a Person, or None for district 0.

        House portraits are looked up through ``seat_map``; senate portraits
        come directly from the district number.
        """
        first_name = row["FirstName"]
        middle_name = row["MiddleName"]
        last_name = row["LastName"]
        # Collapse the doubled space left behind when the middle name is blank.
        full_name = re.sub(
            r"[\s]{2,}", " ",
            "{} {} {}".format(first_name, middle_name, last_name))

        if chamber == "lower":
            district = "{} {}".format(row["County"],
                                      int(row["District"])).strip()
        else:
            district = str(int(row["District"])).strip()

        party = self.party_map[row["party"].upper()]
        email = row["WorkEmail"]

        # District 0 marks an empty/placeholder seat.
        if district == "0":
            self.warning("Skipping {}, district is set to 0".format(full_name))
            return

        person = Person(primary_org=chamber,
                        district=district,
                        name=full_name,
                        party=party)
        person.extras = {
            "first_name": first_name,
            "middle_name": middle_name,
            "last_name": last_name,
        }

        if email:
            # State-domain addresses are treated as the Capitol contact.
            office = "Capitol" if email.endswith(
                "@leg.state.nh.us") else "District"
            person.add_contact_detail(type="email",
                                      value=email,
                                      note=office + " Office")

        # Capture legislator office contact information.
        district_address = "{}\n{}\n{}, {} {}".format(row["Address"],
                                                      row["address2"],
                                                      row["city"],
                                                      row["State"],
                                                      row["Zipcode"]).strip()
        if district_address:
            office = "Capitol" if chamber == "upper" else "District"
            person.add_contact_detail(type="address",
                                      value=district_address,
                                      note=office + " Office")

        phone = row["Phone"].strip() or None
        if phone:
            # The Capitol exchange is 271-xxxx.
            office = "Capitol" if "271-" in phone else "District"
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note=office + " Office")

        # Retrieve the legislator's portrait, if one can be located.
        profile_url = None
        if chamber == "upper":
            profile_url = self.senate_profile_url.format(row["District"])
        elif chamber == "lower":
            try:
                profile_url = self.house_profile_url.format(
                    seat_map[row["seatno"]])
            except KeyError:
                # Seat not in the map; no portrait available.
                pass

        if profile_url:
            person.image = self._get_photo(profile_url, chamber)
            person.add_source(profile_url)

        return person
Пример #36
0
    def scrape_chamber(self, chamber):
        """Yield a Person per Arizona legislator from the member roster page.

        :param chamber: 'upper' or 'lower', mapped to the site's body code.
        """
        body = {"lower": "H", "upper": "S"}[chamber]
        url = "http://www.azleg.gov/MemberRoster/?body=" + body
        page = self.get(url).text

        # there is a bad comment closing tag on this page
        page = page.replace("--!>", "-->")

        root = html.fromstring(page)

        # Skip the header row of the roster table.
        path = "//table//tr"
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ""
            name, district, party, email, room, phone, = row.xpath("td")

            if email.attrib.get("class") == "vacantmember":
                continue  # Skip any vacant members.

            link = name.xpath("string(a/@href)")
            # A name cell with extra children carries a leadership position
            # in its tail text (e.g. a chamber officer).
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()
            if "--" in name:
                name = name.split("--")[0].strip()

            # Member detail page has the same broken comment tag.
            linkpage = self.get(link).text
            linkpage = linkpage.replace("--!>", "-->")
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

            if len(photos) != 1:
                self.warning("no photo on " + link)
                photo_url = ""
            else:
                photo_url = photos[0].attrib["src"]

            district = district.text_content().strip()
            party = party.text_content().strip()
            email = email.text_content().strip()

            # The roster shows only the local-part prefixed with "Email: ".
            if email.startswith("Email: "):
                email = email.replace("Email: ", "").lower() + "@azleg.gov"
            else:
                email = ""

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == "lower":
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = (address + "1700 West Washington\n Room " + room +
                       "\nPhoenix, AZ 85007")

            # Prepend the Phoenix area code when it is missing.
            phone = phone.text_content().strip()
            if "602" not in re.findall(r"(\d+)", phone):
                phone = "602-" + phone

            leg = Person(
                primary_org=chamber,
                image=photo_url,
                name=name,
                district=district,
                party=party,
            )
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=phone,
                                   note="Capitol Office")
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type="email", value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)
                # leg.add_role(position, term, chamber=chamber,
                #             district=district, party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            yield leg
Пример #37
0
    def scrape_lower_chamber(self, term):
        """Yield a Person for each current Oklahoma House member.

        Walks the roster table, skipping vacant seats, then visits each
        member's district page for a photo and office details.

        :param term: legislative term identifier (accepted for interface
            compatibility with callers; not used in the scrape itself).
        """
        url = "https://www.okhouse.gov/Members/Default.aspx"
        page = self.curl_lxmlize(url)

        legislator_nodes = self.get_nodes(
            page, '//table[@id="ctl00_ContentPlaceHolder1_RadGrid1_ctl00"]/tbody/tr'
        )

        for legislator_node in legislator_nodes:
            name_node = self.get_node(legislator_node, ".//td[1]/a")

            # Rows without a name link cannot be processed; previously
            # `name`/`name_text` could be left stale from the prior row here.
            if name_node is None:
                continue

            name_text = name_node.text.strip()

            # Handle seats with no current representative.
            if re.search(r"District \d+", name_text):
                continue

            last_name, delimiter, first_name = name_text.partition(",")

            # str.partition never returns None, so the original
            # `is not None` test could never reach the error branch;
            # check the delimiter instead to catch unparseable names.
            if delimiter:
                first_name = first_name.strip()
                last_name = last_name.strip()
                name = " ".join([first_name, last_name])
            else:
                raise ValueError("Unable to parse name: {}".format(name_text))

            if name.startswith("House District"):
                continue

            district_node = self.get_node(legislator_node, ".//td[3]")
            party_node = self.get_node(legislator_node, ".//td[4]")

            # Both cells are required; skip the row instead of reusing
            # stale `district`/`party_text` values from an earlier iteration.
            if district_node is None or party_node is None:
                continue

            district = district_node.text.strip()
            party_text = party_node.text.strip()
            party = self._parties[party_text]

            legislator_url = (
                "https://www.okhouse.gov/Members/District.aspx?District=" + district
            )
            legislator_page = self.curl_lxmlize(legislator_url)

            photo_url = self.get_node(
                legislator_page, '//a[@id="ctl00_ContentPlaceHolder1_imgHiRes"]/@href'
            )

            person = Person(
                primary_org="lower",
                district=district,
                name=name,
                party=party,
                image=photo_url,
            )
            person.extras["_scraped_name"] = name_text
            person.add_link(legislator_url)
            person.add_source(url)
            person.add_source(legislator_url)

            # Scrape offices.
            self.scrape_lower_offices(legislator_page, person, district)

            yield person
Пример #38
0
    def legislators(self, latest_only):
        """Build Person objects for Illinois legislators.

        :param latest_only: passed through to ``self._memberships`` to
            restrict results to the most recent term.
        :return: dict mapping canonical name to a
            ``(Person, [(chamber, district, term, party), ...])`` tuple,
            one entry per person across all of their memberships.
        """
        legs = {}

        for member, chamber, term, url in self._memberships(latest_only):
            # Roster row cells: name, (unused), (unused), district, party.
            name, _, _, district, party = member.xpath("td")
            district = district.text
            detail_url = name.xpath("a/@href")[0]

            if party.text_content().strip() == "":
                self.warning("Garbage party: Skipping!")
                continue

            # Unknown party abbreviations raise KeyError here.
            party = {
                "D": "Democratic",
                "R": "Republican",
                "I": "Independent"
            }[party.text]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith("*"):
                name = name.strip("*")
                continue

            # Normalize known alternate spellings to a canonical name.
            name = AKA.get(name, name)

            # A legislator can appear once per membership row; accumulate
            # terms under a single Person keyed by canonical name.
            if name in legs:
                p, terms = legs[name]
                terms.append((chamber, district, term, party))
            else:
                p = Person(name, party=party)
                legs[name] = p, [(chamber, district, term, party)]

            p.add_source(url)
            p.add_source(detail_url)
            p.add_link(detail_url)

            birth_date = BIRTH_DATES.get(name, None)
            if birth_date:
                p.birth_date = birth_date

            leg_html = self.get(detail_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(detail_url)

            hotgarbage = ("Senate Biography Information for the 98th General "
                          "Assembly is not currently available.")

            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning("No legislator bio available for " + name)
                continue

            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]
            p.image = photo_url

            # Reset before re-scraping: this person may already carry
            # contact details from an earlier membership row.
            p.contact_details = []
            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                p.add_contact_detail(type="email",
                                     value=email[0].tail.strip(),
                                     note="capitol")

            offices = {
                "capitol": '//table[contains(string(), "Springfield Office")]',
                "district": '//table[contains(string(), "District Office")]',
            }

            for location, xpath in offices.items():
                table = leg_doc.xpath(xpath)
                if table:
                    # NOTE(review): indexes the 4th matching table —
                    # presumably the page nests tables so the outer ones
                    # also match; confirm against the live markup.
                    for type, value in self._table_to_office(table[3]):
                        # Drop phone/fax entries that fail validation.
                        if type in ("fax", "voice"
                                    ) and not validate_phone_number(value):
                            continue

                        p.add_contact_detail(type=type,
                                             value=value,
                                             note=location)

        return legs
Пример #39
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each sitting member of one chamber of the
        South Carolina legislature, plus an Organization for each committee
        encountered (each committee is yielded exactly once, before the
        first member who belongs to it).

        :param chamber: 'lower' for the House; anything else scrapes the
            Senate roster.
        """
        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/member.php?chamber=H'
        else:
            url = 'http://www.scstatehouse.gov/member.php?chamber=S'

        # committee name -> Organization, so repeats reuse the same object.
        seen_committees = {}

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[contains(@href, "code=")]'):
            full_name = a.text
            leg_url = a.get('href')

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if 'Resigned effective' in leg_html:
                self.info('Resigned')
                continue

            # The bio header is three identically-styled <p> text nodes:
            # party, district, and a map link (ignored).
            party, district, _ = leg_doc.xpath(
                '//p[@style="font-size: 17px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')

            # NOTE(review): any other party string passes through as raw
            # page text — presumably only these two occur; confirm.
            if 'Republican' in party:
                party = 'Republican'
            elif 'Democrat' in party:
                party = 'Democratic'

            # District # - County - Map
            district = district.split()[1]
            try:
                photo_url = leg_doc.xpath(
                    '//img[contains(@src,"/members/")]/@src')[0]
            except IndexError:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ''
            person = Person(name=full_name,
                            district=district,
                            party=party,
                            primary_org=chamber,
                            image=photo_url)

            # office address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left; width: 225px;'
                    ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
                capitol_address = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                capitol_phone = phone.strip()

                if capitol_address:
                    person.add_contact_detail(type='address',
                                              value=capitol_address,
                                              note='Capitol Office')

                if capitol_phone:
                    person.add_contact_detail(type='voice',
                                              value=capitol_phone,
                                              note='Capitol Office')
            except IndexError:
                self.warning('no capitol address for {0}'.format(full_name))

            # home address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left;'
                    ' width: 225px; margin: 10px 0 0 20px;"]')[0]
                addr = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                phone = phone.strip()
                if addr:
                    person.add_contact_detail(type='address',
                                              value=addr,
                                              note='District Office')

                if phone:
                    person.add_contact_detail(type='voice',
                                              value=phone,
                                              note='District Office')
            except IndexError:
                self.warning('no district address for {0}'.format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            for com in leg_doc.xpath(
                    '//a[contains(@href, "committee.php")]')[1:]:
                # com.text is only the anchor's direct text; when it ends
                # with ", " the role abbreviation lives in a child element,
                # so text_content() (direct text + children) is split here.
                if com.text.endswith(', '):
                    committee, role = com.text_content().rsplit(', ', 1)

                    # known roles (unknown abbreviations raise KeyError)
                    role = {
                        'Treas.': 'treasurer',
                        'Secy.': 'secretary',
                        'Secy./Treas.': 'secretary/treasurer',
                        'V.C.': 'vice-chair',
                        '1st V.C.': 'first vice-chair',
                        'Co 1st V.C.': 'co-first vice-chair',
                        '2nd V.C.': 'second vice-chair',
                        '3rd V.C.': 'third vice-chair',
                        'Ex.Officio Member': 'ex-officio member',
                        'Chairman': 'chairman'
                    }[role]
                else:
                    committee = com.text
                    role = 'member'

                # only yield each committee once
                if committee not in seen_committees:
                    com = Organization(name=committee,
                                       classification='committee',
                                       chamber=chamber)
                    com.add_source(url)
                    seen_committees[committee] = com
                    yield com
                else:
                    com = seen_committees[committee]

                person.add_membership(com, role=role)

            yield person
Пример #40
0
    def scrape_upper(self, chamber):
        """Yield a Person for each Michigan state senator.

        Photo URLs are probed with HEAD requests against the Senate image
        directory (``<surname>.png`` first, then ``<surname>.jpg``); e-mail
        addresses are pulled from each member's "Contact Me" page.

        :param chamber: accepted for interface symmetry with the lower
            chamber scraper; this method always scrapes the Senate roster.
        """
        url = 'http://www.senate.michigan.gov/senatorinfo_list.html'
        url_to_append = 'http://www.senate.michigan.gov/_images/'
        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        for row in doc.xpath('//table[not(@class="calendar")]//tr')[3:]:
            # Only full 7-cell roster rows are member rows.
            if len(row) != 7:
                continue

            # party, dist, member, office_phone, office_fax, office_loc
            party, dist, member, contact, phone, fax, loc = row.getchildren()
            if (party.text_content().strip() == ""
                    or 'Lieutenant Governor' in member.text_content()):
                continue

            party = abbr[party.text]
            district = dist.text_content().strip()
            name = member.text_content().strip()
            name = re.sub(r'\s+', " ", name)

            # Moved ahead of the photo probing: vacant seats are now skipped
            # before issuing HEAD requests for a non-existent member's photo.
            if name == 'Vacant':
                self.info('district %s is vacant', district)
                continue

            surname = re.split(', | ', name)
            surname[0] = re.sub(r"[']", '', surname[0])
            try:
                self.head(url_to_append + surname[0] + '.png')
                photo_url = url_to_append + surname[0] + '.png'
            except scrapelib.HTTPError:
                try:
                    self.head(url_to_append + surname[0] + '.jpg')
                    photo_url = url_to_append + surname[0] + '.jpg'
                except scrapelib.HTTPError:
                    photo_url = None

            leg_url = member.xpath('a/@href')[0]
            office_phone = phone.text
            office_fax = fax.text

            # Expand building abbreviations into full mailing addresses.
            office_loc = loc.text
            office_loc = re.sub(
                ' Farnum Bldg',
                ' Farnum Office Building\n125 West Allegan Street\nLansing, MI 48933',
                office_loc)
            office_loc = re.sub(' Capitol Bldg',
                                ' State Capitol Building\nLansing, MI 48909',
                                office_loc)

            # email addresses aren't on the list page anymore but they
            # are on the page linked off "Contact Me"

            # data has a typo in a row ("Conact Me"), so accept both spellings
            contact_url = [
                a for a in row.xpath(".//a")
                if a.text in ('Contact Me', 'Conact Me')
            ][0].get('href')
            contact_html = self.get(contact_url).text
            contact_doc = lxml.html.fromstring(contact_html)

            email = None
            header_email = contact_doc.xpath("//a[@class='header_email']")
            if header_email:
                email = header_email[0].text
            else:
                # not using the most common template, but maybe they
                # dropped their email on the page somewhere
                links = contact_doc.xpath('//a') or []
                text_email = [
                    a for a in links if 'mailto:' in (a.get('href') or '')
                ]
                if text_email:
                    email = text_email[0].text

            person = Person(name=name,
                            district=district,
                            party=party,
                            primary_org='upper',
                            image=photo_url)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address',
                                      value=office_loc,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=office_phone,
                                      note='Capitol Office')
            person.add_contact_detail(type='fax',
                                      value=office_fax,
                                      note='Capitol Office')
            if email:
                person.add_contact_detail(type='email',
                                          value=email,
                                          note='Capitol Office')

            yield person
Пример #41
0
    def scrape(self):
        """Yield Person and Organization objects for Pittsburgh City Council.

        Member terms, addresses, phones and websites come from the Legistar
        API; e-mail addresses come from the scraped Legistar web pages.
        Committee organizations are yielded first, then all members.
        """
        body_types = self.body_types()
        city_council, = [body for body in self.bodies()
                         if body["BodyName"] == "City Council"]
        terms = collections.defaultdict(list)

        for office in self.body_offices(city_council):
            # Skip placeholder records for vacant seats.
            if "VACAN" not in office["OfficeRecordFullName"]:
                terms[office["OfficeRecordFullName"].strip()].append(office)

        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
        web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        # Index scraped web data by member name for the lookups below.
        web_info = {}
        for member in web_scraper.councilMembers():
            web_info[member["Person Name"]] = member

        members = {}
        for member, offices in terms.items():
            person = Person(member)
            for term in offices:
                # Every term is recorded with the fixed "Councilmember"
                # title; the office's own title was previously read into an
                # unused local and has been dropped.
                person.add_term("Councilmember",
                                "legislature",
                                start_date = self.toDate(term["OfficeRecordStartDate"]),
                                end_date = self.toDate(term["OfficeRecordEndDate"]))

            if member in web_info:
                web = web_info[member]
                if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                    person.add_contact_detail(type="email",
                                        value=web["E-mail"]["label"],
                                        note="E-mail")

            # `term` is the last office from the loop above: sources and
            # contact details come from the most recent office record.
            person_source_data = self.person_sources_from_office(term)
            person_api_url, person_api_response = person_source_data
            person.add_source(person_api_url, note="api")

            if person_api_response["PersonAddress1"]:
                address = (person_api_response["PersonAddress1"] + ", " + person_api_response["PersonCity1"]
                          + ", " + person_api_response["PersonState1"] + " " + person_api_response["PersonZip1"])
                person.add_contact_detail(type="address",
                                    value=address,
                                    note="Office address")

            if person_api_response["PersonPhone"]:
                person.add_contact_detail(type="voice",
                                    value=person_api_response["PersonPhone"],
                                    note="Office phone")

            if person_api_response["PersonWWW"]:
                person.add_contact_detail(type="url",
                                    value=person_api_response["PersonWWW"],
                                    note="District website")

            members[member] = person

        for body in self.bodies():
            if body["BodyTypeId"] == body_types["Committee"]:
                body_name_clean = body["BodyName"].strip()
                organization = Organization(body_name_clean,
                             classification="committee",
                             parent_id={"name" : "Pittsburgh City Council"})

                organization.add_source(self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")

                for office in self.body_offices(body):
                    role = office["OfficeRecordMemberType"]
                    # Only "Vice Chair" and "Chair" are kept as-is; every
                    # other title collapses to "Member".  (The original also
                    # tested `role == "Councilmember"`, which was dead code:
                    # that value already fails the membership test.)
                    if role not in ("Vice Chair", "Chair"):
                        role = "Member"

                    person = office["OfficeRecordFullName"].strip()
                    if person in members:
                        person = members[person]
                    else:
                        person = Person(person)

                    person.add_membership(body_name_clean,
                                     role=role,
                                     start_date = self.toDate(office["OfficeRecordStartDate"]),
                                     end_date = self.toDate(office["OfficeRecordEndDate"]))

                yield organization

        for person in members.values():
            yield person
Пример #42
0
    def scrape_lower(self, chamber):
        """Yield a Person for each member on the Michigan House roster,
        skipping districts that are currently vacant.
        """
        url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
        columns = [
            "website", "district", "name", "party", "location", "phone",
            "email"
        ]

        doc = lxml.html.fromstring(self.get(url).text)

        # skip two rows at top
        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            cells = row.xpath('.//td')
            if not cells:
                continue

            # Map each roster column name onto its table cell.
            info = {col: cells[i] for i, col in enumerate(columns)}

            district = str(int(info['district'].text_content().strip()))
            party = info['party'].text_content().strip()
            phone = info['phone'].text_content().strip()
            email = info['email'].text_content().strip()
            name = info['name'].text_content().strip()

            if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
                self.warning(
                    'District {} appears vacant, and will be skipped'.format(
                        district))
                continue

            leg_url = info['website'].xpath("./a")[0].attrib['href']

            # Expand building abbreviations into full mailing addresses.
            office = info['location'].text_content().strip()
            office = re.sub(
                ' HOB',
                ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
                office)
            office = re.sub(' CB',
                            ' State Capitol Building\nLansing, MI 48909',
                            office)

            try:
                photo_url = self.get_photo_url(leg_url)[0]
            except (scrapelib.HTTPError, IndexError):
                photo_url = ''
                self.warning('no photo url for %s', name)

            person = Person(name=name,
                            district=district,
                            party=abbr[party],
                            primary_org='lower',
                            image=photo_url)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address',
                                      value=office,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')

            yield person
Пример #43
0
    def scrape(self):
        """Yield a Person for each Chicago council member (plus the Mayor
        and Clerk), then an Organization for each committee they sit on.
        """
        committees_seen = {}
        non_committees = ('City Council', 'Office of the Mayor')

        for member, memberships in self.councilMembers():
            if member['Ward/Office'] == "":
                continue

            district = member['Ward/Office']
            if district not in [
                    "Mayor",
                    "Clerk",
            ]:
                district = "Ward {}".format(int(district))

            person = Person(member['Person Name']['label'],
                            district=district,
                            primary_org="legislature")

            if member['Photo']:
                person.image = member['Photo']

            # scraped page field -> (contact type, note)
            contact_map = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for field, (kind, note) in contact_map.items():
                value = member[field]
                if value:
                    person.add_contact_detail(type=kind,
                                              value=value,
                                              note=note)

            if member["E-mail"]:
                person.add_contact_detail(type="email",
                                          value=member['E-mail']['label'],
                                          note='E-mail')

            if member['Website']:
                person.add_link(member['Website']['url'])
            person.add_source(MEMBERLIST)

            for committee, _, _ in memberships:
                label = committee['Legislative Body']['label']
                if label and label not in non_committees:
                    org = committees_seen.get(label)
                    if org is None:
                        # First sighting: build the committee and remember it.
                        org = Organization(label,
                                           classification='committee')
                        org.add_source(
                            "https://chicago.legistar.com/Departments.aspx")
                        committees_seen[label] = org

                    org.add_member(person, role=committee["Title"])
            yield person

        for org in committees_seen.values():
            yield org
Пример #44
0
    def scrape_reps(self):
        """Yield a Person for each voting member of the Maine House.

        Non-voting tribal representatives are not included: they have no
        numbered district and lack most of the standard profile information.
        """
        url = 'http://www.maine.gov/legis/house/dist_mem.htm'
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for district in page.xpath('//a[contains(@href, "dist_twn")]/..'):
            if "- Vacant" in district.text_content():
                self.warning("District is vacant: '{}'".format(
                    district.text_content()))
                continue

            _, district_number = district.xpath('a[1]/@href')[0].split('#')

            leg_url = district.xpath('a[2]/@href')[0]
            leg_info = district.xpath('a[2]/text()')[0]

            # "Representative Jane Doe (D-Bangor)" -> name, party, district.
            INFO_RE = r'''
                    Representative\s
                    (?P<member_name>.+?)
                    \s\(
                    (?P<party>[DRCUIG])
                    -
                    (?P<district_name>.+?)
                    \)
                    '''
            info_search = re.search(INFO_RE, leg_info, re.VERBOSE)

            member_name = info_search.group('member_name')
            party = _party_map[info_search.group('party')]
            district_name = info_search.group('district_name')

            # Get the photo url.
            html = self.get(leg_url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            (photo_url, ) = doc.xpath('//img[contains(@src, ".jpg")]/@src')

            # Add contact information from personal page.  Guard the match:
            # pages without an Address line previously raised AttributeError
            # here, which made the party-office fallback below unreachable.
            address_match = re.search(r'<B>Address:  </B>(.+?)\n?</?P>', html,
                                      re.IGNORECASE)
            office_address = address_match.group(1) if address_match else None

            office_email = doc.xpath(
                '//a[starts-with(@href, "mailto:")]/text()')
            business_phone = re.search(
                r'<B>Business Telephone:  </B>(.+?)</?P>', html, re.IGNORECASE)
            home_phone = re.search(r'<B>Home Telephone:  </B>(.+?)</?P>', html,
                                   re.IGNORECASE)
            cell_phone = re.search(r'<B>Cell Telephone:  </B>(.+?)</?P>', html,
                                   re.IGNORECASE)

            person = Person(
                name=member_name,
                district=district_number,
                primary_org='lower',
                party=party,
                image=photo_url,
            )
            person.extras['district_name'] = district_name

            person.add_link(leg_url)
            person.add_source(leg_url)

            if office_address:
                leg_address = office_address
                person.add_contact_detail(type='address',
                                          value=leg_address,
                                          note='District Office')
            else:
                # If no address for legislator, fall back to the party office.
                if party == 'Democratic':
                    leg_address = (
                        'House Democratic Office, Room 333 State House, 2 State House Station, '
                        'Augusta, Maine 04333-0002')

                    person.add_contact_detail(type='address',
                                              value=leg_address,
                                              note='Party Office')

                elif party == 'Republican':
                    leg_address = (
                        'House GOP Office, Room 332 State House, 2 State House Station, '
                        'Augusta, Maine 04333-0002')

                    person.add_contact_detail(type='address',
                                              value=leg_address,
                                              note='Party Office')

            if office_email:
                office_email = office_email[0]
                person.add_contact_detail(type='email',
                                          value=office_email,
                                          note='District Office')
            if business_phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(
                                              business_phone.group(1)),
                                          note='Business Phone')
            if home_phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(
                                              home_phone.group(1)),
                                          note='Home Phone')
            if cell_phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(
                                              cell_phone.group(1)),
                                          note='Cell Phone')

            yield person
Пример #45
0
    def scrape(self):
        """Yield Person objects for Chicago City Council members (current
        and former) and Organization objects for their committees."""
        committee_d = {}
        # Legislative bodies that should not be emitted as committees.
        non_committees = {
            'City Council', 'Office of the Mayor', 'Office of the City Clerk'
        }

        for councilman, committees in self.councilMembers():
            if councilman['Ward/Office'] == "":
                continue

            ward = councilman['Ward/Office']
            if ward in {"Mayor", "Clerk"}:
                # BUG FIX: previously no Person was created for the Mayor or
                # Clerk rows, so the contact details below were attached to
                # the Person left over from the prior loop iteration (or
                # raised NameError on the first row).
                district = None
                role = ward
            else:
                district = "Ward {}".format(int(ward))
                role = "Alderman"
            p = Person(councilman['Person Name']['label'],
                       district=district,
                       primary_org="legislature",
                       role=role)

            if councilman['Photo']:
                p.image = councilman['Photo']

            # Map source-data column -> (contact type, note).
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if councilman[contact_type]:
                    p.add_contact_detail(type=type_,
                                         value=councilman[contact_type],
                                         note=_note)

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['label'],
                                     note='E-mail')

            if councilman['Website']:
                p.add_link(councilman['Website']['url'])
            p.add_source(councilman['Person Name']['url'], note='web')

            for committee, _, _ in committees:
                committee_name = committee['Legislative Body']['label']
                if committee_name and committee_name not in non_committees:
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        # First time this committee is seen: create it once
                        # and reuse the same Organization for later members.
                        o = Organization(
                            committee_name,
                            classification='committee',
                            parent_id={'name': 'Chicago City Council'})
                        o.add_source(committee['Legislative Body']['url'],
                                     note='web')
                        committee_d[committee_name] = o

                    o.add_member(p, role=committee["Title"])

            yield p

        for name, term in FORMER_ALDERMEN.items():
            p = Person(name=name,
                       primary_org="legislature",
                       start_date=term['term'][0],
                       end_date=term['term'][1],
                       district="Ward {}".format(term['ward']),
                       role='Alderman')
            # Chandler also served a separate earlier term.
            if name == 'Chandler, Michael D.':
                p.add_term('Alderman',
                           "legislature",
                           district="Ward {}".format(term['ward']),
                           start_date=datetime.date(2011, 5, 16),
                           end_date=datetime.date(2015, 5, 18))

            p.add_source(term['source'], note='web')
            yield p

        for o in committee_d.values():
            yield o

        for committee_name in FORMER_COMMITTEES:
            o = Organization(committee_name,
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o

        for joint_committee in JOINT_COMMITTEES:
            o = Organization(joint_committee,
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o
Пример #46
0
    def scrape_member(self, chamber, member_url):
        """Scrape one legislator detail page into a Person object.

        :param chamber: chamber name used as the Person's primary org
        :param member_url: URL of the member's detail page
        :returns: a populated Person
        """
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        photo_url = root.xpath('//div[@class="thumbPhoto"]/img/@src')[0]
        full_name = root.xpath('//h1/span')[0].tail.strip()

        try:
            email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
            email = email.replace('mailto:', '')
        except IndexError:
            # Narrowed from a bare `except`: only a missing mailto link is
            # expected here.
            email = ''
            self.info("seat may be vacant")

        # Header text looks like "<party> - <district>".
        party, district = root.xpath('//h1/span')[1].text.split('-')
        party = party.strip()
        district = clean_district(district.strip())

        if party in ('D', 'Democrat', 'Democratic'):
            party = 'Democratic'
        elif party in ('R', 'Republican'):
            party = 'Republican'
        else:
            party = 'Other'

        leg = Person(primary_org=chamber,
                     district=district,
                     name=full_name,
                     party=party,
                     image=photo_url)
        leg.add_link(member_url)
        leg.add_source(member_url)

        leg.add_contact_detail(type='email',
                               value=email,
                               note='District Office')

        # offices
        for addr in root.xpath('//address/div[@class="contactGroup"]'):
            office_name = addr.xpath(
                '../preceding-sibling::h4/text()')[0].strip()
            if 'District' in office_name:
                note = 'District Office'
            elif 'State' in office_name:
                note = 'Capitol office'
            else:
                # BUG FIX: `note` used to be unbound here (UnboundLocalError
                # on the first office, or stale from a previous iteration) for
                # unrecognized headings; fall back to the heading text itself.
                note = office_name
            try:
                address = addr.xpath('a')[0].text_content()
            except IndexError:
                # Narrowed from a bare `except`: only a missing <a> is expected.
                self.warning("No address info found in `contactGroup`")
            else:
                address = re.sub(r'\s{2,}', '\n', address)
                leg.add_contact_detail(type='address',
                                       value=address,
                                       note=note)
            # `pending` remembers whether the next row holds a phone or fax
            # number (renamed from `next`, which shadowed the builtin).
            pending = None
            for phonerow in addr.xpath('./div/div'):
                phonerow = phonerow.text_content().strip()
                if phonerow == 'Phone:':
                    pending = 'voice'
                elif phonerow == 'Fax:':
                    pending = 'fax'
                elif pending == 'voice':
                    leg.add_contact_detail(type='voice',
                                           value=phonerow,
                                           note=note)
                    pending = None
                elif pending == 'fax':
                    leg.add_contact_detail(type='fax',
                                           value=phonerow,
                                           note=note)
                    pending = None
                else:
                    self.warning('unknown phonerow %s', phonerow)

        return leg
Пример #47
0
    def scrape(self, session=None):
        """Yield Person objects for active NJ legislators, read from the
        legislature's Access database exports (Roster and LegBio tables)."""
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]['name']
            self.info('no session specified, using %s', session)

        year_abr = session[0:4]

        self._init_mdb(year_abr)

        roster_csv = self.access_to_csv('Roster')
        bio_csv = self.access_to_csv('LegBio')

        # Roster Key -> photo URL, from the bio table.
        photos = {}
        for rec in bio_csv:
            photos[rec['Roster Key']] = rec['URLPicture']

        for rec in roster_csv:
            first_name = rec["Firstname"]
            middle_name = rec["MidName"]
            last_name = rec["LastName"]
            suffix = rec["Suffix"]
            full_name = first_name + " " + middle_name + " " + last_name + " " + suffix
            full_name = full_name.replace('  ', ' ')
            # Drops the trailing space left when `suffix` is empty.
            # NOTE(review): this chops the final character unconditionally, so
            # a non-empty suffix (e.g. "Jr.") loses its last character —
            # confirm whether that is intended.
            full_name = full_name[0: len(full_name) - 1]

            district = str(int(rec["District"]))
            party = rec["Party"]
            if party == 'R':
                party = "Republican"
            elif party == 'D':
                party = "Democratic"
            # Any other party code is kept verbatim (removed a dead
            # `else: party = party` branch).
            chamber = rec["House"]
            if chamber == 'A':
                chamber = "lower"
            elif chamber == 'S':
                chamber = "upper"

            leg_status = rec["LegStatus"]
            # skip Deceased/Retired members
            if leg_status != 'Active':
                continue
            phone = rec["Phone"] or None
            email = None
            if rec["Email"]:
                email = rec["Email"]
            try:
                photo_url = photos[rec['Roster Key']]
            except KeyError:
                photo_url = ''
                self.warning('no photo url for %s', rec['Roster Key'])
            url = ('http://www.njleg.state.nj.us/members/bio.asp?Leg=' +
                   str(int(rec['Roster Key'])))
            address = '{0}\n{1}, {2} {3}'.format(rec['Address'], rec['City'],
                                                 rec['State'], rec['Zipcode'])
            gender = {'M': 'Male', 'F': 'Female'}[rec['Sex']]

            person = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                party=party,
                image=photo_url,
                gender=gender,
            )

            person.add_link(url)
            person.add_source(url)
            person.add_source('http://www.njleg.state.nj.us/downloads.asp')

            person.add_contact_detail(type='address', value=address, note='District Office')
            if phone is not None:
                person.add_contact_detail(type='voice', value=phone, note='District Office')
            if email is not None:
                person.add_contact_detail(type='email', value=email, note='District Office')

            yield person
Пример #48
0
    def _scrape_upper(self, roster_page, roster_url):
        """
        Yield Person objects for members of the upper legislative chamber.
        """
        # TODO: photo_urls https://senate.texas.gov/members.php
        #       also available on individual member screens
        # TODO: email addresses could be scraped from secondary sources
        #       https://github.com/openstates/openstates/issues/1292
        vacant_re = re.compile(r'district \d+ constituent services',
                               re.IGNORECASE)

        for member_tbl in roster_page.xpath('//table[@class="memdir"]'):
            # Scrape legislator information from roster URL
            anchor = member_tbl.xpath('.//a')[0]
            name = anchor.text
            # Vacant districts appear under a placeholder name; skip them.
            if vacant_re.search(name):
                continue
            member_url = anchor.get('href')

            district = member_tbl.xpath(
                './/span[contains(text(), "District:")]')[0].tail.lstrip('0')
            party = member_tbl.xpath(
                './/span[contains(text(), "Party:")]')[0].tail
            if party == 'Democrat':
                party = 'Democratic'

            # Create Person object
            member = Person(name=name,
                            district=district,
                            party=party,
                            primary_org='upper')
            member.add_link(member_url)

            # Each <td headers=...> cell holds one office: free-text lines
            # interleaved with <span> labels that mark the line just read as
            # a phone or fax number.
            capitol_count = 1
            for office_cell in member_tbl.xpath('.//td[@headers]'):
                phone = None
                fax = None
                pieces = [office_cell.text]
                for child in office_cell.getchildren():
                    if child.tag == 'span' and child.text:
                        # The span labels the line we just collected.
                        if 'TEL' in child.text:
                            phone = pieces.pop()
                        elif 'FAX' in child.text:
                            fax = pieces.pop()
                    elif child.tail:
                        pieces.append(child.tail)

                address = '\n'.join(p.strip() for p in pieces if p)
                if 'CAP' in office_cell.get('headers'):
                    office_name = 'Capitol Office #{}'.format(capitol_count)
                    capitol_count += 1
                else:
                    office_name = 'District Office'

                # Add office contact information to Person object
                if address:
                    member.add_contact_detail(type='address',
                                              value=address,
                                              note=office_name)
                if phone:
                    member.add_contact_detail(type='voice',
                                              value=phone,
                                              note=office_name)
                if fax:
                    member.add_contact_detail(type='fax',
                                              value=fax,
                                              note=office_name)

            # Add source links to Person object
            member.add_source(roster_url)
            member.add_source(member_url)
            yield member
Пример #49
0
    def handle_list_item(self, item):
        """Build a Person from one row of the FL House member list.

        Returns None for vacant/resigned/pending seats; otherwise returns
        the populated Person (after scheduling a detail-page scrape and
        trying to match an email from the PDF directory).
        """
        link = item.xpath('.//div[contains(@class, "rep_style")]/a')[0]
        name = link.text_content().strip()

        # Placeholder rows for empty seats are skipped entirely.
        if "Vacant" in name or "Resigned" in name or "Pending" in name:
            return

        party = item.xpath(
            './/div[contains(@class, "party_style")]/text()')[0].strip()
        party = {"D": "Democratic", "R": "Republican"}[party]

        district = item.xpath(
            './/div[contains(@class, "district_style")]/text()')[0].strip()

        leg_url = link.get("href")
        split_url = parse.urlsplit(leg_url)
        member_id = parse.parse_qs(split_url.query)["MemberId"][0]
        # Member photos follow a predictable URL pattern keyed by member id.
        image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(
            member_id)

        name = fix_name(name)
        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=image,
        )
        rep.add_link(leg_url)
        rep.add_source(leg_url)
        rep.add_source(self.url)

        self.scrape_page(RepDetail, leg_url, obj=rep)

        # look for email in the list from the PDF directory - ideally
        # we'd find a way to better index the source data which
        # wouldn't require guessing the email, but this does at least
        # confirm that it's correct

        # deal with some stuff that ends up in name that won't work in
        # email, spaces, quotes, high latin1
        email_name = rep.name.replace('"', "").replace("La ",
                                                       "La").replace("ñ", "n")
        (last, *other) = re.split(r"[-\s,]+", email_name)

        # deal with a missing nickname used in an email address
        if "Patricia" in other:
            other.append("Pat")

        # search through all possible first names and nicknames
        # present - needed for some of the more elaborate concoctions
        found_email = False
        for first in other:
            # NOTE(review): this literal looks like a redacted email format
            # string; with no %-conversion specifiers, the `%` operator below
            # raises TypeError at runtime. Presumably the original was
            # something like "%s.%s@<house domain>" — confirm against the
            # upstream source before relying on this code path.
            email = "*****@*****.**" % (first, last)
            if email in self.member_emails:
                # it's bad if we can't uniquely match emails, so throw an error
                if email in self.claimed_member_emails:
                    raise ValueError(
                        "Email address %s matches multiple reps - %s and %s." %
                        (email, rep.name, self.claimed_member_emails[email]))

                self.claimed_member_emails[email] = rep.name

                rep.add_contact_detail(type="email",
                                       value=email,
                                       note="Capitol Office")
                rep.add_source(self.directory_pdf_url)

                found_email = True

                break

        if not found_email:
            log.warning("Rep %s does not have an email in the directory PDF." %
                        (rep.name, ))

        return rep
Пример #50
0
    def _scrape_representative(self, url, parties):
        """
        Yield a Person object representing a member of the lower
        legislative chamber (yields None and stops for a vacant seat).
        """
        # url = self.get(url).text.replace('<br>', '')
        member_page = self.lxmlize(url)

        photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
        # A bare "/.jpg" path means the page has no real photo.
        if photo_url.endswith('/.jpg'):
            photo_url = None

        scraped_name, district_text = member_page.xpath(
            '//div[@class="member-info"]/h2')
        scraped_name = scraped_name.text_content().strip().replace('Rep. ', '')
        scraped_name = ' '.join(scraped_name.split())

        # "Last, First" -> "First Last"
        name = ' '.join(scraped_name.split(', ')[::-1])

        district_text = district_text.text_content().strip()
        district = str(self.district_re.search(district_text).group(1))

        # Vacant house "members" are named after their district numbers:
        if re.match(r'^\d+$', scraped_name):
            yield None
            # BUG FIX: previously execution fell through and also yielded a
            # Person for the vacant seat; stop the generator here instead.
            return

        party = parties[district]

        person = Person(name=name,
                        district=district,
                        party=party,
                        primary_org='lower')

        if photo_url is not None:
            person.image = photo_url

        person.add_link(url)
        person.add_source(url)

        def office_name(element):
            """Returns the office address type."""
            return element.xpath('preceding-sibling::h4[1]/text()')[0] \
                .rstrip(':')

        offices_text = [{
            'name':
            office_name(p_tag),
            'type':
            office_name(p_tag).replace(' Address', '').lower(),
            'details':
            p_tag.text_content()
        } for p_tag in member_page.xpath(
            '//h4/following-sibling::p[@class="double-space"]')]

        for office_text in offices_text:
            details = office_text['details'].strip()

            # A few member pages have blank office listings:
            if details == '':
                continue

            # At the time of writing, this case of multiple district
            # offices occurs exactly once, for the representative at
            # District 43:
            if details.count('Office') > 1:
                district_offices = [
                    district_office.strip() for district_office in re.findall(
                        r'(\w+ Office.+?(?=\w+ Office|$))',
                        details,
                        flags=re.DOTALL)
                ]
                # Appending to offices_text during iteration is deliberate:
                # the split-out offices are handled by later loop passes.
                offices_text += [{
                    'name':
                    re.match(r'\w+ Office', office).group(),
                    'type':
                    'district',
                    'details':
                    re.search(r'(?<=Office).+(?=\w+ Office|$)?', office,
                              re.DOTALL).group()
                } for office in district_offices]

            match = self.address_re.search(details)
            if match is not None:
                # Collapse blank lines and strip trailing spaces per line.
                address = re.sub(' +$',
                                 '',
                                 match.group().replace('\r', '').replace(
                                     '\n\n', '\n'),
                                 flags=re.MULTILINE)
            else:
                # No valid address found in the details.
                continue

            phone_number = extract_phone(details)
            fax_number = extract_fax(details)

            if address:
                person.add_contact_detail(type='address',
                                          value=address,
                                          note=office_text['name'])
            if phone_number:
                person.add_contact_detail(type='voice',
                                          value=phone_number,
                                          note=office_text['name'])
            if fax_number:
                person.add_contact_detail(type='fax',
                                          value=fax_number,
                                          note=office_text['name'])

        yield person
Пример #51
0
    def scrape_details(self, chamber, leg_name, leg_link, role):
        """Scrape one MS legislator's XML detail record and yield a Person.

        :param chamber: 'upper' or 'lower'; selects the email domain
        :param leg_name: legislator display name
        :param leg_link: relative path of the member's XML record
        :param role: role string passed through to the Person
        """
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.get(url)
            root = lxml.etree.fromstring(details_page.content)
            party = root.xpath('string(//PARTY)')

            district = root.xpath('string(//DISTRICT)')

            # IMG_NAME in the XML is relative to the member directory.
            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')

            home_address = root.xpath('string(//H_ADDRESS)')
            home_address2 = root.xpath('string(//H_ADDRESS2)')
            home_city = root.xpath('string(//H_CITY)')
            home_zip = root.xpath('string(//H_ZIP)')

            # Assemble a multi-line district (home) address when present.
            home_address_total = ''
            if home_address and home_city:
                if not home_address2:
                    home_address_total = "%s\n%s, MS %s" % (
                        home_address, home_city, home_zip)
                else:
                    home_address_total = "%s\n%s\n%s, MS %s" % (
                        home_address, home_address2, home_city, home_zip)

            # bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            # other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
            cap_room = root.xpath('string(//CAP_ROOM)')

            # Special-case members whose party is missing from the feed; the
            # asserts force removal of the workaround once the feed lists one.
            if leg_name in ('Lataisha Jackson', 'John G. Faulkner'):
                assert not party, (
                    "Remove special-casing for this Democrat without a "
                    "listed party: {}").format(leg_name)
                party = 'Democratic'
            elif leg_name in ('James W. Mathis', 'John Glen Corley'):
                assert not party, (
                    "Remove special-casing for this Republican without"
                    " a listed party: {}").format(leg_name)
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            else:
                raise AssertionError(
                    "A member with no identifiable party was found: {}".format(
                        leg_name))
            leg = Person(primary_org=chamber,
                         district=district,
                         party=party,
                         image=photo,
                         name=leg_name,
                         role=role)
            leg.extras['org_info'] = org_info
            leg.add_source(url)
            leg.add_link(url)

            # EMAIL_ADDRESS is sometimes just the mailbox name; in that case
            # build a full address from the chamber's domain.
            if email_name != "":
                if "@" in email_name:
                    email = email_name
                else:
                    email = '%s@%s.ms.gov' % (email_name, {
                        "upper": "senate",
                        "lower": "house"
                    }[chamber])
                leg.add_contact_detail(type='email',
                                       value=email,
                                       note='Capitol Office')

            if capital_phone != "":
                leg.add_contact_detail(type='voice',
                                       value=capital_phone,
                                       note='Capitol Office')

            if cap_room != "":
                address = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                address = CAP_ADDRESS
            leg.add_contact_detail(type='address',
                                   value=address,
                                   note='Capitol Office')

            if home_phone != "":
                leg.add_contact_detail(type='voice',
                                       value=home_phone,
                                       note='District Office')

            if home_address_total != "":
                leg.add_contact_detail(type='address',
                                       value=home_address_total,
                                       note='District Office')

            yield leg
        except scrapelib.HTTPError as e:
            # Some member links 404; log and move on to the next member.
            self.warning(str(e))
Пример #52
0
    def scrape(self, session=None):
        """Yield Person objects for active NJ legislators, read from the
        legislature's Access database exports (Roster and LegBio tables)."""
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["name"]
            self.info("no session specified, using %s", session)

        year_abr = session[0:4]

        self._init_mdb(int(year_abr))

        roster_csv = self.access_to_csv("Roster")
        bio_csv = self.access_to_csv("LegBio")

        # Roster Key -> photo URL, from the bio table.
        photos = {}
        for rec in bio_csv:
            photos[rec["Roster Key"]] = rec["URLPicture"]

        for rec in roster_csv:
            first_name = rec["Firstname"]
            middle_name = rec["MidName"]
            last_name = rec["LastName"]
            suffix = rec["Suffix"]
            full_name = first_name + " " + middle_name + " " + last_name + " " + suffix
            full_name = full_name.replace("  ", " ")
            # Drops the trailing space left when `suffix` is empty.
            # NOTE(review): this chops the final character unconditionally, so
            # a non-empty suffix (e.g. "Jr.") loses its last character —
            # confirm whether that is intended.
            full_name = full_name[0:len(full_name) - 1]

            district = str(int(rec["District"]))
            party = rec["Party"]
            if party == "R":
                party = "Republican"
            elif party == "D":
                party = "Democratic"
            # Any other party code is kept verbatim (removed a dead
            # `else: party = party` branch).
            chamber = rec["House"]
            if chamber == "A":
                chamber = "lower"
            elif chamber == "S":
                chamber = "upper"

            leg_status = rec["LegStatus"]
            # skip Deceased/Retired members
            if leg_status != "Active":
                continue
            phone = rec["Phone"] or None
            email = None
            if rec["Email"]:
                email = rec["Email"]

            # Email has been removed from the Access DB, but it's
            # still [email protected] and [email protected] - many
            # reps have these emails on their personal pages even if
            # they're gone from the DB file
            if not email:
                email = self._construct_email(chamber, rec["Sex"], last_name)

            try:
                photo_url = photos[rec["Roster Key"]]
            except KeyError:
                photo_url = ""
                self.warning("no photo url for %s", rec["Roster Key"])
            url = "http://www.njleg.state.nj.us/members/bio.asp?Leg=" + str(
                int(rec["Roster Key"]))
            address = "{0}\n{1}, {2} {3}".format(rec["Address"], rec["City"],
                                                 rec["State"], rec["Zipcode"])
            gender = {"M": "Male", "F": "Female"}[rec["Sex"]]

            person = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                party=party,
                image=photo_url,
                gender=gender,
            )

            person.add_link(url)
            person.add_source(url)
            person.add_source("http://www.njleg.state.nj.us/downloads.asp")

            person.add_contact_detail(type="address",
                                      value=address,
                                      note="District Office")
            if phone is not None:
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="District Office")
            if email is not None:
                person.add_contact_detail(type="email",
                                          value=email,
                                          note="District Office")

            yield person
Пример #53
0
    def scrape_legislator(self, name, chamber, url, contact_page):
        """Build a Person for one SD legislator from their detail page,
        pulling the email address from the shared contact page."""
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        party = doc.xpath("string(//span[contains(@id, 'Party')])").strip()
        if party == 'Democrat':
            party = 'Democratic'

        district = doc.xpath(
            "string(//span[contains(@id, 'District')])").strip().lstrip('0')

        occupation = doc.xpath(
            "string(//span[contains(@id, 'Occupation')])").strip()

        (photo_url, ) = doc.xpath('//img[contains(@id, "_imgMember")]/@src')

        office_phone = doc.xpath(
            "string(//span[contains(@id, 'CapitolPhone')])").strip()

        legislator = Person(primary_org=chamber,
                            image=photo_url,
                            name=name,
                            party=party,
                            district=district
                            )
        legislator.extras['occupation'] = occupation
        if office_phone.strip() != "":
            legislator.add_contact_detail(
                type='voice', value=office_phone, note='Capitol Office')

        # SD removed email from the detail pages but it's still in the
        # contact page, shared for all congress people
        member_id = re.search(r'Member=(\d+)', url).group(1)

        # find the profile block by finding a link inside it to their
        # detail page
        matches = contact_page.xpath(
            '//ul[@id="contact-list"]//a[contains(@href, "Member=%s")]' % (member_id,))
        if matches:
            # look for the adjacent email mailto link
            container = matches[0].getparent().getparent().getparent()
            mail_links = container.xpath('./span/span/a[@class="mail-break"]')
            if mail_links:
                email = mail_links[0].text.strip()
                if email:
                    legislator.add_contact_detail(type='email',
                                                  value=email,
                                                  note='Capitol Office')

        home_lines = [
            line.strip() for line in
            doc.xpath('//td/span[contains(@id, "HomeAddress")]/text()')
            if line.strip()
        ]
        if home_lines:
            legislator.add_contact_detail(type='address',
                                          value="\n".join(home_lines),
                                          note='District Office')
            home_phone = doc.xpath(
                "string(//span[contains(@id, 'HomePhone')])").strip()
            if home_phone:
                legislator.add_contact_detail(type='voice',
                                              value=home_phone,
                                              note='District Office')

        legislator.add_source(url)
        legislator.add_link(url)

        for committee in doc.xpath(
                '//div[@id="divCommittees"]/span/section/table/tbody/tr/td/a'):
            self.scrape_committee(legislator, url, committee, chamber)
        yield legislator
Пример #54
0
    def scrape_legislators(self, url, chamber):
        """Yield Person objects parsed from the state's legislator CSV,
        augmented with details scraped from each member's own page."""
        raw = self.get(url).text
        raw = raw.replace('"""', '"')  # weird triple quotes
        rows = csv.DictReader(
            raw.splitlines(),
            ['last_name', 'first_name', 'party', 'district',
             'address', 'city', 'state', 'zip'])

        district_leg_urls = self._district_legislator_dict()

        # Toss the row headers.
        next(rows)

        for row in rows:
            if not row:
                continue

            # District, e.g. "HD 12" -> ("HD", "12").
            hd_or_sd, district = row['district'].split()

            # Party.
            party = {'D': 'Democratic', 'R': 'Republican'}[row['party']]

            # Get full name properly capped.
            fullname = '%s %s' % (row['first_name'].title(),
                                  row['last_name'].title())

            legislator = Person(name=fullname, primary_org=chamber,
                                district=district, party=party,
                                image=row.get('photo_url', ''))
            legislator.add_source(url)

            # Get any info at the legislator's detail_url.
            deets = {}
            try:
                detail_url = district_leg_urls[hd_or_sd][district]
                deets = self._scrape_details(detail_url)
            except KeyError:
                self.warning(
                    "Couldn't find legislator URL for district {} {}, likely retired; skipping"
                    .format(hd_or_sd, district)
                )
                continue
            except NoDetails:
                self.logger.warning("No details found at %r" % detail_url)
                continue
            else:
                legislator.add_source(detail_url)
                legislator.add_link(detail_url)

            # District office address comes from the CSV itself.
            address = '\n'.join([
                row['address'],
                '%s, %s %s' % (row['city'].title(), row['state'], row['zip']),
                ])
            legislator.add_contact_detail(type='address', value=address,
                                          note='District Office')

            # Phone/fax/email come from the scraped detail page, if present.
            for deet_key, contact_type in (('phone', 'voice'),
                                           ('fax', 'fax'),
                                           ('email', 'email')):
                value = deets.get(deet_key)
                if value:
                    legislator.add_contact_detail(type=contact_type,
                                                  value=value,
                                                  note='District Office')

            yield legislator
Пример #55
0
    def scrape_legislator(self, chamber, url):
        """Scrape one New Mexico legislator detail page and yield a Person.

        :param chamber: 'upper' (Senator) or 'lower' (Representative);
            any other value leaves the name title prefix untouched.
        :param url: legislator detail page URL; recorded as both link
            and source on the yielded Person.
        :raises ValueError: if the legislator data table is missing.

        Vacant seats are skipped with a warning (nothing is yielded).
        """
        # Initialize default values for legislator attributes.
        full_name = None
        party = None
        photo_url = None
        email = None
        capitol_address = None
        capitol_phone = None
        district = None
        district_address = None
        district_phone = None

        if chamber == 'upper':
            title_prefix = 'Senator '
        elif chamber == 'lower':
            title_prefix = 'Representative '
        else:
            title_prefix = ''

        # Fallback area code for phone numbers listed without one
        # (the state capitol is in Santa Fe).
        santa_fe_area_code = '(505)'

        page = self.lxmlize(url)

        info_node = self.get_node(
            page, '//table[@id="MainContent_formViewLegislator"]')
        if info_node is None:
            raise ValueError('Could not locate legislator data.')

        district_node = self.get_node(
            info_node,
            './/a[@id="MainContent_formViewLegislator_linkDistrict"]')
        if district_node is not None:
            district = district_node.text.strip()

        name_node = self.get_node(
            page, './/span[@id="MainContent_formViewLegislatorName'
            '_lblLegislatorName"]')

        # NOTE(review): if name_node is None, full_name stays None and the
        # Person below is built with name=None — presumably the node is
        # always present on non-vacant pages; confirm upstream.
        if name_node is not None:
            if name_node.text.strip().endswith(' Vacant'):
                self.warning(
                    'Found vacant seat for {} district {}; skipping'.format(
                        chamber, district))
                return

            # The heading looks like 'Senator Jane Doe - (D)': split off
            # the trailing party marker, then strip the title prefix.
            n_head, n_sep, n_party = name_node.text.rpartition(' - ')

            full_name = re.sub(r'^{}'.format(title_prefix), '', n_head.strip())

            if '(D)' in n_party:
                party = 'Democratic'
            elif '(R)' in n_party:
                party = 'Republican'
            elif '(DTS)' in n_party:
                # decline to state = independent
                party = 'Independent'
            else:
                # Bug fix: report the unparsed party text (n_party) —
                # `party` is still None on this branch, so the old message
                # always said 'Unknown party None'.
                raise AssertionError('Unknown party {} for {}'.format(
                    n_party, full_name))

        photo_node = self.get_node(
            info_node,
            './/img[@id="MainContent_formViewLegislator_imgLegislator"]')
        if photo_node is not None:
            photo_url = photo_node.get('src')

        email_node = self.get_node(
            info_node, './/a[@id="MainContent_formViewLegislator_linkEmail"]')
        if email_node is not None and email_node.text:
            email = email_node.text.strip()

        capitol_address_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblCapitolRoom"]')
        if capitol_address_node is not None:
            capitol_address_text = capitol_address_node.text
            if capitol_address_text is not None:
                # Page only lists the room number; build a full address.
                capitol_address = 'Room {} State Capitol\nSanta Fe, NM 87501'\
                    .format(capitol_address_text.strip())

        capitol_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblCapitolPhone"]')
        if capitol_phone_node is not None:
            capitol_phone_text = capitol_phone_node.text
            if capitol_phone_text:
                capitol_phone_text = capitol_phone_text.strip()
                area_code, phone = extract_phone_number(capitol_phone_text)
                if phone:
                    capitol_phone = '{} {}'.format(
                        area_code.strip() if area_code else santa_fe_area_code,
                        phone)

        district_address_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblAddress"]')
        if district_address_node is not None:
            district_address = '\n'.join(district_address_node.xpath('text()'))

        # Prefer the office phone; fall back to the home phone.
        office_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblOfficePhone"]')

        home_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblHomePhone"]')

        if office_phone_node is not None and office_phone_node.text:
            district_phone_text = office_phone_node.text
        elif home_phone_node is not None and home_phone_node.text:
            district_phone_text = home_phone_node.text
        else:
            district_phone_text = None
        if district_phone_text:
            d_area_code, d_phone = extract_phone_number(district_phone_text)
            # Bug fix: mirror the capitol-phone handling above — guard
            # against a missing number and fall back to the Santa Fe area
            # code instead of crashing on d_area_code.strip() when
            # extract_phone_number found no area code.
            if d_phone:
                district_phone = '{} {}'.format(
                    d_area_code.strip() if d_area_code else santa_fe_area_code,
                    d_phone)

        person = Person(name=full_name,
                        district=district,
                        party=party,
                        primary_org=chamber,
                        image=photo_url)
        if district_address:
            person.add_contact_detail(type='address',
                                      value=district_address,
                                      note='District Office')
        if district_phone:
            person.add_contact_detail(type='voice',
                                      value=district_phone,
                                      note='District Office')
        if capitol_address:
            person.add_contact_detail(type='address',
                                      value=capitol_address,
                                      note='Capitol Office')
        if capitol_phone:
            person.add_contact_detail(type='voice',
                                      value=capitol_phone,
                                      note='Capitol Office')
        if email:
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')

        person.add_link(url)
        person.add_source(url)

        yield person
Пример #56
0
    def scrape(self):
        """Yield Chicago City Council members and committee organizations.

        Term data comes from the Legistar API; contact details come from
        the Legistar web UI.  Committee and joint-committee Organization
        objects are yielded first, then every Person.
        """
        body_types = self.body_types()

        # One-element unpacking: exactly one body named 'City Council'
        # is expected; this raises if that assumption ever breaks.
        city_council, = [body for body in self.bodies()
                         if body['BodyName'] == 'City Council']

        # Group each member's office (term) records by full name,
        # skipping vacancy placeholder records.
        terms = collections.defaultdict(list)
        for office in self.body_offices(city_council):
            if 'VACAN' not in office['OfficeRecordFullName']:
                terms[office['OfficeRecordFullName'].strip()].append(office)

        # Scrape the web member list for contact info, keyed by name.
        web_scraper = LegistarPersonScraper(None,None)
        web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
        web_scraper.ALL_MEMBERS = '3:3'

        web_info = {}
        for member, _ in web_scraper.councilMembers({'ctl00$ContentPlaceHolder$lstName' : 'City Council'}):
            web_info[member['Person Name']['label']] = member


        # Former aldermen missing from the current web listing: stub in
        # just the ward number so the lookups below succeed (every other
        # field reads as None via the defaultdict).
        web_info['Balcer, James'] = collections.defaultdict(lambda : None)
        web_info['Fioretti, Bob'] = collections.defaultdict(lambda : None)
        web_info['Balcer, James']['Ward/Office'] = 11
        web_info['Fioretti, Bob']['Ward/Office'] = 2
        
        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            p = Person(member)
            for term in offices:
                # NOTE(review): `role` is assigned but unused — every
                # alderman term is added with the fixed 'Alderman' role.
                role = term['OfficeRecordTitle']
                p.add_term('Alderman',
                           'legislature',
                           district = "Ward {}".format(int(web['Ward/Office'])),
                           start_date = self.toDate(term['OfficeRecordStartDate']),
                           end_date = self.toDate(term['OfficeRecordEndDate']))

            if web['Photo'] :
                p.image = web['Photo']

            # Web-UI field name -> (contact detail type, note).
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                # 'N/A' entries are treated the same as missing data.
                if web[contact_type] and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value= web[contact_type],
                                         note=_note)

            if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != 'N/A':
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['label'],
                                     note='E-mail')


            if web['Website']:
                p.add_link(web['Website']['url'])

            # NOTE(review): `term` here is the leftover loop variable from
            # the offices loop above, i.e. the last office record only —
            # confirm that sourcing from the final term is intended.
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')


            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name' : 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                for office in self.body_offices(body):
                    # messed up record for joanna thompson
                    if office['OfficeRecordId'] == 1055:
                        continue
                        
                    # Only Vice Chair / Chairman are preserved as-is;
                    # everything else is normalized to 'Member'.
                    role = office['OfficeRecordTitle']
                    if role not in ("Vice Chair", "Chairman"):
                        role = 'Member'

                    person = office['OfficeRecordFullName'].strip()
                    if person in members:
                        p = members[person]
                    else:
                        # Committee member who is not a sitting alderman:
                        # create a bare Person carrying sources only.
                        p = Person(person)
                        
                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(body['BodyName'],
                                     role=role,
                                     start_date = self.toDate(office['OfficeRecordStartDate']),
                        
                                     end_date = self.toDate(office['OfficeRecordEndDate']))
                        

                yield o

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Joint Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name' : 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                yield o        

        for p in members.values():
            yield p
    def scrape_chamber(self, chamber=None):
        """Scrape Hawaii legislators for one chamber and yield Person objects.

        ``chamber`` selects the listing page to scrape, but the
        authoritative chamber for each legislator is re-derived from the
        scraped page data inside the loop.
        """
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
            except KeyError:
                # Workaround: skip records from a known-bad upstream
                # legislator page instead of crashing (see notes below).
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            person = Person(name=leg['name'],
                            district=leg['district'],
                            party=leg['party'],
                            primary_org=chamber,
                            image=leg['image'])

            for source in leg['source']:
                person.add_source(source)

            try:
                for ctty in leg['ctty']:
                    # 'Joint Legislative ...' committees span chambers.
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    comm = Organization(name=ctty['name'],
                                        classification="committee",
                                        chamber=ctty_chamber)
                    comm.add_member(person, role="member")
                    # NOTE(review): `comm` is never yielded or otherwise
                    # saved — committee organizations built here may be
                    # silently discarded; confirm whether add_member alone
                    # is sufficient.

            except KeyError:
                self.warn("%s has no scraped Committees" % leg['name'])

            person.add_link(leg['homepage'])

            if leg['addr']:
                person.add_contact_detail(type='address',
                                          value=leg['addr'],
                                          note='Capitol Office')
            if leg['phone']:
                person.add_contact_detail(type='voice',
                                          value=leg['phone'],
                                          note='Capitol Office')
            if leg['email']:
                person.add_contact_detail(type='email',
                                          value=leg['email'],
                                          note='Capitol Office')
            if leg['fax']:
                person.add_contact_detail(type='fax',
                                          value=leg['fax'],
                                          note='Capitol Office')
            yield person
Пример #58
0
    def scrape_member(self, chamber, member_url):
        """Scrape an Arkansas member page and yield a Person.

        The page heading reads like 'Senator Jane Doe (R)'.  The title
        word overrides ``chamber`` (covering Representative-Elect and
        Senator-Elect for the incoming class), and a member-elect with no
        party listed yet is tolerated with a warning.
        """
        root = lxml.html.fromstring(self.get(member_url).text)

        heading_words = root.xpath(
            'string(//div[@class="col-md-12"]/h1[1])').split()

        title = heading_words[0]
        # Account for Representative-Elect and Senator-Elect, for incoming class
        if title.startswith("Representative"):
            chamber = "lower"
        elif title.startswith("Senator"):
            chamber = "upper"

        full_name = " ".join(heading_words[1:-1])
        party = heading_words[-1]

        party_names = {
            "(R)": "Republican",
            "(D)": "Democratic",
            "(G)": "Green",
            "(I)": "Independent",
        }
        if party in party_names:
            party = party_names[party]
        elif "-Elect" in title and not party.startswith("("):
            # No party marker: the last heading word is part of the name.
            self.warning("Member-elect is currently missing a party")
            full_name = " ".join(heading_words[1:])
            party = ""
        else:
            raise AssertionError("Unknown party ({0}) for {1}".format(
                party, full_name))

        photos = root.xpath('//img[@class="SitePhotos MemberPhoto"]')
        if photos:
            photo_url = "https://www.arkleg.state.ar.us" + photos[0].attrib["src"]
        else:
            self.warning("No member photo found")
            photo_url = ""

        # Need to figure out a cleaner method for this later
        # info_box = root.xpath('string(//div[@id="bodyContent"]/div[2]/div[2])')
        try:
            district = root.xpath(
                'string(//div[@id="bodyContent"]/div[2]/div[2]/div[3]/div[2])')
        except AttributeError:
            self.warning("Member has no district listed; skipping them")
            return

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        person.add_link(member_url)
        person.add_source(member_url)

        # Blank xpath results are treated as missing contact details.
        phone = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[1]/div[2]/a)'
        )
        if not phone.strip():
            phone = None

        email = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[2]/div[2]/a)'
        )
        if not email.strip():
            email = None

        address = root.xpath(
            'string(//div[@id="bodyContent"]/div[1]/div[1]/p/b)')

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="District Office")
        if phone is not None:
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note="District Office")
        if email is not None:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Office")

        occupation_label = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[1]/b)'
        )
        if occupation_label == "Occupation:":
            # NB: matches the original behavior — the extras entry is kept
            # even when the scraped occupation text turns out to be empty.
            person.extras["occupation"] = root.xpath(
                'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[2])'
            )

        yield person
Пример #59
0
    def scrape_chamber(self, chamber, session):
        """Scrape Wisconsin legislators for one chamber; yield Person objects.

        :param chamber: 'upper' (senate) or 'lower' (assembly).
        :param session: session slug used in the docs.legis.wisconsin.gov URL.

        Vacant districts are skipped; a legislator with no recognizable
        party fails the assertion below.
        """
        url = 'https://docs.legis.wisconsin.gov/{}/legislators/{}'.format(
            session,
            {
                'upper': 'senate',
                'lower': 'assembly'
            }[chamber],
        )

        body = self.get(url).text
        page = lxml.html.fromstring(body)
        page.make_links_absolute(url)

        for row in page.xpath(
                ".//div[@class='box-content']/div[starts-with(@id,'district')]"
        ):
            # Skip rows with no links at all and explicit vacancies.
            if row.xpath(
                    ".//a/@href") and not row.xpath(".//a[text()='Vacant']"):
                # Force the detail link onto https.  Bug fix: the previous
                # code used str.strip("https://"), which strips ANY run of
                # the characters {h, t, p, s, :, /} from BOTH ends of the
                # string and could mangle URLs ending in those characters;
                # rewrite the scheme explicitly instead.
                rep_url = row.xpath(".//a[text()='Details']/@href")[0]
                rep_url = "https://" + re.sub(r'^https?://', '', rep_url)
                rep_doc = lxml.html.fromstring(self.get(rep_url).text)
                rep_doc.make_links_absolute(rep_url)

                full_name = rep_doc.xpath(
                    './/div[@id="district"]/h1/text()')[0].replace(
                        "Senator ", "").replace("Representative ", "")

                # Party appears as e.g. '(R - Madison)'; map the letter
                # through PARTY_DICT.
                party = rep_doc.xpath('.//div[@id="district"]//small/text()')
                if len(party) > 0:
                    party = PARTY_DICT[party[0].split("-")[0].strip(
                        "(").strip()]
                else:
                    party = None
                # District number is the last path component of the second
                # district link; int() round-trip normalizes '05' -> '5'.
                district = rep_doc.xpath(
                    './/div[@id="district"]/h3/a/@href')[1]
                district = district.split("/")[-1]
                district = str(int(district))

                # email
                email = rep_doc.xpath("//span[@class='info email']/a/text()")
                if email:
                    email = email[0]
                else:
                    email = ''

                assert party is not None, "{} is missing party".format(
                    full_name)

                person = Person(
                    name=full_name,
                    district=district,
                    primary_org=chamber,
                    party=party,
                )

                img = rep_doc.xpath('.//div[@id="district"]/img/@src')
                if img:
                    person.image = img[0]

                # Capitol office address: join the non-empty text lines.
                address_lines = rep_doc.xpath(
                    './/span[@class="info office"]/text()')
                address = '\n'.join([
                    line.strip() for line in address_lines
                    if line.strip() != ""
                ])
                person.add_contact_detail(type='address',
                                          value=address,
                                          note='Capitol Office')

                # Index [1] skips the first text node — presumably a label
                # preceding the number; TODO confirm against a live page.
                phone = rep_doc.xpath(
                    './/span[@class="info telephone"]/text()')
                if phone:
                    phone = re.sub(r'\s+', ' ', phone[1]).strip()
                    person.add_contact_detail(type='voice',
                                              value=phone,
                                              note='Capitol Office')

                fax = rep_doc.xpath('.//span[@class="info fax"]/text()')
                if fax:
                    fax = re.sub(r'\s+', ' ', fax[1]).strip()
                    person.add_contact_detail(type='fax',
                                              value=fax,
                                              note='Capitol Office')

                if email:
                    person.add_contact_detail(type='email',
                                              value=email,
                                              note='Capitol Office')

                person.add_link(rep_url)
                person.add_source(rep_url)

                yield person
Пример #60
0
    def get_people(self):
        """Scrape Miami-Dade elected officials and yield Person objects.

        Commissioner ('district') pages share a common layout and are
        additionally scraped for email / phone / fax contact details.
        """
        people_base_url = "http://miamidade.gov/wps/portal/Main/government"
        doc = self.lxmlize(people_base_url)
        person_list = doc.xpath("//div[contains(@id,'elected')]//span")
        # Leadership titles that get mixed into the scraped name text.
        titles = ["Chairman", "Vice Chair"]
        for person in person_list:
            # Each span's text is position / name / link labels separated
            # by carriage returns — assumes the site keeps emitting '\r';
            # TODO confirm this still holds.
            info = person.text_content().strip().split("\r")
            position = info[0].strip()
            name = " ".join(info[1:-1])
            name = name.replace("Website | Contact", "")
            for title in titles:
                name = name.replace(title, "")
            name = name.strip()
            url = person.xpath(".//a[contains(text(),'Website')]/@href")[0]
            image = person.xpath(".//img/@src")[0]
            pers = Person(name=name,
                          image=image,
                          primary_org='legislature',
                          role=position)
            pers.add_source(people_base_url,
                            note="Miami-Dade government website")
            pers.add_source(url, note="individual's website")

            #the commissioners have consistent site format
            if "district" in position.lower():
                person_doc = self.lxmlize(url)
                contact_rows = person_doc.xpath(
                    "//div[@class='leftContentContainer']//p")
                for line in contact_rows:
                    line_text = line.text_content()
                    if "email" in line_text.lower():
                        email_address = line_text.replace("Email:", "").strip()
                        pers.add_contact_detail(type="email",
                                                value=email_address)
                        continue
                    try:
                        # Expected layout: office / phone / fax on three
                        # newline-separated lines.
                        office, phone, fax = line_text.strip().split("\n")
                    except ValueError:
                        #ick, it's all on one line.
                        if "downtown office" in line_text.lower():
                            office = "Downtown Office"
                        elif "district office" in line_text.lower():
                            office = "District Office"
                        else:
                            continue
                        # Fixed-position slices — assumes the one-line
                        # format never shifts; TODO confirm against a
                        # live commissioner page.
                        phone = line_text[15:27]
                        fax = line_text[33:45]

                    if "office" not in office.lower():
                        continue
                        #social is also available in here
                        #but I don't see a place to put it
                    phone = phone.replace("Phone", "").strip()
                    fax = fax.replace("Fax", "").strip()
                    pers.add_contact_detail(
                        type="voice",  #phone is not allowed ????
                        value=phone,
                        note=office.strip())

                    pers.add_contact_detail(
                        type="fax",  #phone is not allowed ????
                        value=fax,
                        note=office.strip())

            yield pers