Пример #1
0
    def handle_list_item(self, row):
        """Turn one senator roster row into a Person, or None if skipped.

        Rows with no first name are ignored; extra contact details come
        from ``self.extra_info`` keyed by the senator's full name.
        """
        if not row['First Name']:
            return
        # This Dan Schoen hack very probably can be removed after April 2018.
        if row['First Name'] == 'Dan' and row['Last Name'] == 'Schoen':
            self.danSchoenSeen = True
            return
        name = '{} {}'.format(row['First Name'], row['Last Name'])
        party = PARTIES[row['Party']]
        leg = Person(name=name,
                     district=row['District'].lstrip('0'),
                     party=party,
                     primary_org='upper',
                     role='Senator',
                     image=self.extra_info[name]['image'])
        leg.add_link(self.extra_info[name]['url'])
        leg.add_contact_detail(type='voice',
                               value=self.extra_info[name]['office_phone'],
                               note='capitol')
        if 'email' in self.extra_info[name]:
            leg.add_contact_detail(type='email',
                                   value=self.extra_info[name]['email'],
                                   note='capitol')

        row['Zipcode'] = row['Zipcode'].strip()
        # Accommodate for multiple address column naming conventions.
        address1_fields = [row.get('Address'), row.get('Office Building')]
        address2_fields = [row.get('Address2'), row.get('Office Address')]
        row['Address'] = next((a for a in address1_fields if a is not None),
                              False)
        row['Address2'] = next((a for a in address2_fields if a is not None),
                               False)

        # BUG FIX: this condition was previously a bare generator
        # expression, which is always truthy, so every address was filed
        # as a capitol address.  Wrap it in any(), and guard against
        # row['Address2'] being False (no second address column found).
        if row['Address2'] and any(
                a in row['Address2'] for a in
                ['95 University Avenue W', '100 Rev. Dr. Martin Luther King']):
            address = (
                '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(
                    **row))
            if 'Rm. Number' in row:
                address = '{0} {1}'.format(row['Rm. Number'], address)
            leg.add_contact_detail(type='address',
                                   value=address,
                                   note='capitol')
        elif row['Address2']:
            address = (
                '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(
                    **row))
            leg.add_contact_detail(type='address',
                                   value=address,
                                   note='district')
        else:
            address = '{Address}\n{City}, {State} {Zipcode}'.format(**row)
            leg.add_contact_detail(type='address',
                                   value=address,
                                   note='district')

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg
Пример #2
0
    def scrape_senator_page(self, chamber, url):
        """Scrape the senate roster at *url*, yielding a Person per member."""
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' portraitContainer ')]"):
            # The portrait URL is embedded in an inline style: "...url(<img>)...".
            img = legislator.xpath(
                ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
            img = img[img.find("(") + 1:img.find(")")]
            full_name = legislator.xpath(
                ".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(
                ".//a[@class='profileImageLink']")[0].attrib["href"]
            district = legislator.xpath(".//div[@class='profileDistrict']"
                                        "/a/text()")[0].split("#")[1]

            if "Vacant" in full_name:
                continue

            # BUG FIX: the homepage document used to be bound to `page`,
            # shadowing the roster document from above.  It only worked
            # because the xpath() result list was already materialized;
            # use a distinct name so the two documents can't be confused.
            homepage = self.get(homepage_url).text
            member_page = lxml.html.fromstring(homepage)
            phone = member_page.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = member_page.xpath(
                "//div[@class='address']/span/text()")
            address = "\n".join(address_lines)

            party_image = member_page.xpath(
                '//div[@class="senatorParty"]/img/@src')[0]
            if "Republican" in party_image:
                party = "Republican"
            elif "Democrat" in party_image:
                party = "Democratic"
            # NOTE(review): if the image URL names neither party, `party`
            # keeps the previous member's value (or raises NameError on the
            # first) -- confirm every member page carries a party image.

            email = ("rep{0:0{width}}@ohiohouse.gov" if chamber == "lower" else
                     "sd{0:0{width}}@ohiosenate.gov").format(int(district),
                                                             width=2)

            leg = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                image=img,
                party=party,
            )

            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=phone,
                                   note="Capitol Office")
            leg.add_contact_detail(type="email",
                                   value=email,
                                   note="Capitol Office")

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Пример #3
0
    def scrape_page(self, chamber, url):
        """Scrape an Ohio member listing page and yield a Person per member."""
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' memberModule ')]"):
            img = legislator.xpath(
                ".//div[@class='thumbnail']//img")[0].attrib['src']
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()

            if "Vacant" in full_name:
                continue

            homepage = homepage.attrib['href']
            party = data.xpath(
                ".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)
            office = "\n".join(office_lines)
            h3 = data.xpath("./h3")
            if len(h3):
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District",
                                                            "").strip()
            else:
                # BUG FIX: raw string for the regex -- "\d" in a plain
                # string literal is an invalid escape sequence
                # (DeprecationWarning today, an error in future Pythons).
                district = re.findall(r"\d+\.png",
                                      legislator.attrib['style'])[-1].split(
                                          ".", 1)[0]

            # BUG FIX: raw string here too, for the same reason.
            full_name = re.sub(r"\s+", " ", full_name).strip()
            email = ('rep{0:0{width}}@ohiohouse.gov' if chamber == 'lower' else
                     'sd{0:0{width}}@ohiosenate.gov').format(int(district),
                                                             width=2)

            leg = Person(name=full_name,
                         district=district,
                         party=party,
                         primary_org=chamber,
                         image=img)

            leg.add_contact_detail(type='address',
                                   value=office,
                                   note='Capitol Office')
            leg.add_contact_detail(type='voice',
                                   value=phone,
                                   note='Capitol Office')
            leg.add_contact_detail(type='email',
                                   value=email,
                                   note='Capitol Office')

            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
Пример #4
0
    def scrape_legislator(self, chamber, name, url):
        """Scrape a single legislator's detail page and yield a Person."""
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        # First word after "DISTRICT" in the heading, minus leading zeros.
        heading = doc.xpath('//h1[contains(., "DISTRICT")]/text()').pop()
        district = heading.split()[1].strip().lstrip('0')

        # The party letter appears parenthesized in the last <h2>.
        letter = re.search(
            r'\((R|D|I)[ \-\]]',
            doc.xpath('//h2').pop().text_content(),
        ).group(1)
        party = {'D': 'Democratic',
                 'R': 'Republican',
                 'I': 'Independent'}[letter]

        photo_url = doc.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib['src']

        person = Person(name, district=district, party=party,
                        image=photo_url, primary_org=chamber)
        person.add_link(url)
        person.add_source(url)
        self.scrape_offices(person, doc)

        yield person
    def scrape_chamber(self, session):
        """Yield a Person for each legislator the API reports for *session*."""
        members = self.api_client.get('legislators',
                                      session=SESSION_KEYS[session])

        for member in members:
            slug = member['WebSiteUrl'].split('/')[-1]
            subdomain = 'house' if member['Chamber'] == 'H' else 'senate'
            img = ('https://www.oregonlegislature.gov/'
                   '{}/MemberPhotos/{}.jpg').format(subdomain, slug)

            # The API says "Democrat"; the canonical spelling is "Democratic".
            party = member['Party']
            if party == 'Democrat':
                party = 'Democratic'

            full_name = '{} {}'.format(member['FirstName'],
                                       member['LastName'])
            person = Person(
                name=full_name,
                primary_org={'S': 'upper', 'H': 'lower'}[member['Chamber']],
                party=party,
                district=member['DistrictNumber'],
                image=img,
            )
            person.add_link(member['WebSiteUrl'])
            person.add_source(member['WebSiteUrl'])

            if member['CapitolAddress']:
                person.add_contact_detail(type='address',
                                          value=member['CapitolAddress'],
                                          note='Capitol Office')
            if member['CapitolPhone']:
                person.add_contact_detail(type='voice',
                                          value=member['CapitolPhone'],
                                          note='Capitol Office')
            person.add_contact_detail(type='email',
                                      value=member['EmailAddress'],
                                      note='Capitol Office')

            yield person
Пример #6
0
    def scrape_member(self, chamber, member_url):
        """Scrape one Kentucky member bio page and yield a Person with contacts."""
        doc = lxml.html.fromstring(self.get(member_url).text)

        photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
        name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
        full_name = ' '.join(name_pieces[1:-1]).strip()

        # The trailing token is the parenthesized party letter; anything
        # unrecognized passes through unchanged.
        party = name_pieces[-1]
        party = {'(R)': 'Republican',
                 '(D)': 'Democratic',
                 '(I)': 'Independent'}.get(party, party)

        district = doc.xpath(
            '//span[@id="districtHeader"]/text()')[0].split()[-1]

        person = Person(name=full_name, district=district, party=party,
                        primary_org=chamber, image=photo_url)
        person.add_source(member_url)
        person.add_link(member_url)

        address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                      'span[@class="bioText"]/text()'))

        # Only "Annex:" numbers count; a "(fax)" suffix marks the fax line.
        phone = fax = None
        for num in doc.xpath(
                '//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()'):
            if not num.startswith('Annex: '):
                continue
            num = num.replace('Annex: ', '')
            if num.endswith(' (fax)'):
                fax = num.replace(' (fax)', '')
            else:
                phone = num

        # Keep the last email address on the official @lrc.ky.gov domain.
        email = None
        for candidate in doc.xpath(
                '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'):
            if '@lrc.ky.gov' in str(candidate):
                email = candidate

        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='Capitol Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='Capitol Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='Capitol Office')

        if address.strip() == "":
            self.warning("Missing Capitol Office!!")
        else:
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')

        yield person
Пример #7
0
    def scrape_upper_chamber(self, term):
        """Scrape the Oklahoma Senate roster and yield a Person per senator."""
        url = "http://oksenate.gov/Senators/Default.aspx"
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        table = doc.xpath('//table[@summary]')[0]
        for a in table.xpath('.//td//a[contains(@href, "biographies")]'):
            # The district number follows the link, either in the parent's
            # tail text or in a sibling <span>.
            tail = a.xpath('..')[0].tail
            if tail:
                district = tail.split()[1]
            else:
                district = a.xpath('../../span')[1].text.split()[1]

            if a.text is None or a.text.strip() == 'Vacant':
                self.warning(
                    "District {} appears to be empty".format(district))
                continue

            # "Jane Doe (R)" -> name + party letter.
            match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
            name = match.group(1)
            party = self._parties[match.group(2)]

            url = a.get('href')

            person = Person(
                primary_org='upper',
                district=district,
                name=name.strip(),
                party=party,
            )
            person.add_link(url)
            person.add_source(url)
            self.scrape_upper_offices(person, url)
            yield person
Пример #8
0
    def scrape_member(self, chamber, link):
        """Build a Person from a roster table row anchor and yield it."""
        name = link.text.strip()
        leg_url = link.get('href')
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")

        # we get email on the next page now
        # email = link.xpath("string(../../td[5])")

        # Normalize the site's party labels.
        party = {'Democrat': 'Democratic',
                 'No Party Specified': 'Independent'}.get(party, party)

        pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(),
                                                        pid))

        leg = Person(name=name,
                     primary_org=chamber,
                     district=district,
                     party=party,
                     image=photo_url)
        leg.add_link(leg_url)
        leg.add_source(leg_url)

        member_doc = lxml.html.fromstring(self.get(link.attrib['href']).text)
        self.scrape_member_page(leg, member_doc)
        yield leg
Пример #9
0
    def handle_list_item(self, item):
        """Turn one House roster entry into a Person, or None if the seat is open."""
        link = item.xpath('.//div[@class="rep_style"]/a')[0]
        name = link.text_content().strip()

        # Skip seats with no sitting member.
        for marker in ('Vacant', 'Resigned', 'Pending'):
            if marker in name:
                return

        party_letter = item.xpath(
            './/div[@class="party_style"]/text()')[0].strip()
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]

        district = item.xpath(
            './/div[@class="district_style"]/text()')[0].strip()

        leg_url = link.get('href')
        member_id = parse.parse_qs(
            parse.urlsplit(leg_url).query)['MemberId'][0]
        image = ('http://www.flhouse.gov/FileStores/Web/Imaging/Member/'
                 '{}.jpg').format(member_id)

        rep = Person(name=name,
                     district=district,
                     party=party,
                     primary_org='lower',
                     role='Representative',
                     image=image)
        rep.add_link(leg_url)
        rep.add_source(leg_url)
        rep.add_source(self.url)

        self.scrape_page(RepDetail, leg_url, obj=rep)

        return rep
Пример #10
0
    def handle_list_item(self, item):
        """Turn one Senate roster entry into a Person, or None for vacant seats."""
        raw = " ".join(item.xpath('.//text()'))
        name = re.sub(r'\s+', " ", raw).replace(" ,", ",").strip()

        if 'Vacant' in name:
            return

        district = item.xpath("string(../../td[1])")
        party = item.xpath("string(../../td[2])")
        if party == 'Democrat':
            party = 'Democratic'

        leg_url = item.get('href')

        senator = Person(name=name,
                         district=district,
                         party=party,
                         primary_org='upper',
                         role='Senator')
        senator.add_link(leg_url)
        senator.add_source(self.url)
        senator.add_source(leg_url)

        self.scrape_page(SenDetail, leg_url, obj=senator)

        return senator
Пример #11
0
    def handle_list_item(self, item):
        """Build a Person from a single House member listing, skipping open seats."""
        link = item.xpath('.//div[@class="rep_style"]/a')[0]
        name = link.text_content().strip()

        if any(flag in name for flag in ('Vacant', 'Resigned', 'Pending')):
            return

        party_map = {'D': 'Democratic', 'R': 'Republican'}
        party = party_map[
            item.xpath('.//div[@class="party_style"]/text()')[0].strip()]

        district = item.xpath(
            './/div[@class="district_style"]/text()')[0].strip()

        leg_url = link.get('href')
        query = parse.urlsplit(leg_url).query
        member_id = parse.parse_qs(query)['MemberId'][0]
        image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(member_id)

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative', image=image)
        rep.add_link(leg_url)
        rep.add_source(leg_url)
        rep.add_source(self.url)

        self.scrape_page(RepDetail, leg_url, obj=rep)

        return rep
Пример #12
0
def test_full_person():
    """Round-trip a fully-populated person through the importer and verify fields."""
    link = 'http://example.com/link'
    source = 'http://example.com/source'

    person = ScrapePerson('Tom Sawyer')
    person.add_identifier('1')
    person.add_name('Tommy', start_date='1880')
    person.add_contact_detail(type='phone',
                              value='555-555-1234',
                              note='this is fake')
    person.add_link(link)
    person.add_source(source)

    # import person
    PersonImporter('jurisdiction-id').import_data([person.as_dict()])

    # get person from db and assert it imported correctly
    p = Person.objects.get()
    assert 'ocd-person' in p.id
    assert p.name == person.name

    identifier = p.identifiers.all()[0]
    assert identifier.identifier == '1'
    assert identifier.scheme == ''

    other_name = p.other_names.all()[0]
    assert other_name.name == 'Tommy'
    assert other_name.start_date == '1880'

    contact = p.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert p.links.all()[0].url == link
    assert p.sources.all()[0].url == source
Пример #13
0
    def handle_list_item(self, item):
        """Turn one House roster entry into a Person and yield it.

        Address is always attached; phone and email are attached only when
        they pass validation.
        """
        photo_url = item.xpath('./img/@src')[0]
        url = item.xpath('.//h5/a/@href')[0]
        name_text = item.xpath('.//h5/a/b/text()')[0]

        # "Jane Doe (01A, DFL)" -> name, district, party code.
        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [x.strip() for x in item.xpath(
            './div/text()[normalize-space()]'
        ) if x.strip()]
        address = '\n'.join((info_texts[0], info_texts[1]))

        # BUG FIX: phone/email used to be left unbound when validation
        # failed, raising NameError at add_contact_detail below.
        # Initialize to None and attach only values that validated.
        phone = None
        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email = None
        email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
        if validate_email_address(email_text):
            email = email_text

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address, note='capitol')
        if phone:
            rep.add_contact_detail(type='voice', value=phone, note='capitol')
        if email:
            rep.add_contact_detail(type='email', value=email, note='capitol')
        rep.add_source(self.url)

        yield rep
Пример #14
0
    def handle_list_item(self, item):
        """Create a Person for one Senate roster link; vacant seats yield None."""
        raw = " ".join(item.xpath(".//text()"))
        name = re.sub(r"\s+", " ", raw).replace(" ,", ",").strip()

        if "Vacant" in name:
            return

        district = item.xpath("string(../../td[1])")
        party = item.xpath("string(../../td[2])")
        if party == "Democrat":
            party = "Democratic"

        leg_url = item.get("href")

        senator = Person(
            name=fix_name(name),
            district=district,
            party=party,
            primary_org="upper",
            role="Senator",
        )
        senator.add_link(leg_url)
        senator.add_source(self.url)
        senator.add_source(leg_url)

        self.scrape_page(SenDetail, leg_url, obj=senator)

        return senator
Пример #15
0
def test_full_person():
    """End-to-end check: scrape a person, import it, and verify every field."""
    scraped = ScrapePerson('Tom Sawyer')
    scraped.add_identifier('1')
    scraped.add_name('Tommy', start_date='1880')
    scraped.add_contact_detail(type='phone', value='555-555-1234',
                               note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # import person
    PersonImporter('jurisdiction-id').import_data([scraped.as_dict()])

    # get person from db and assert it imported correctly
    imported = Person.objects.get()
    assert 'ocd-person' in imported.id
    assert imported.name == scraped.name

    assert imported.identifiers.all()[0].identifier == '1'
    assert imported.identifiers.all()[0].scheme == ''

    assert imported.other_names.all()[0].name == 'Tommy'
    assert imported.other_names.all()[0].start_date == '1880'

    detail = imported.contact_details.all()[0]
    assert detail.type == 'phone'
    assert detail.value == '555-555-1234'
    assert detail.note == 'this is fake'

    assert imported.links.all()[0].url == 'http://example.com/link'
    assert imported.sources.all()[0].url == 'http://example.com/source'
Пример #16
0
    def scrape(self):
        """Yield a Person per sitting commissioner, then each committee Organization."""
        committee_d = {}

        for councilman, committees in self.councilMembers():

            p = Person(' '.join((councilman['First name'],
                                 councilman['Last name'])))
            # A few members' districts aren't in their profile URL;
            # hard-code those, and skip the board president entirely.
            if p.name == 'Toni Preckwinkle':
                continue
            elif p.name == 'Robert Steele':
                district = 2
            elif p.name == 'Jerry Butler':
                district = 3
            elif p.name == 'Sean Morrison':
                district = 17
            else:
                # BUG FIX: raw string for the regex -- "\d" in a plain
                # string literal is an invalid escape sequence.
                district = re.findall(r'\d+',
                                      councilman['Person Name']['url'])[0]

            start_date = self.toTime(councilman['Start Date']).date()
            end_date = self.toTime(councilman['End Date']).date()

            # 2018-12-02 appears to be the source's placeholder end date
            # for sitting members -- treated as "no end date".
            if end_date == datetime.date(2018, 12, 2):
                end_date = ''
            else:
                end_date = end_date.isoformat()

            p.add_term('Commissioner', 'legislature',
                       district='District {}'.format(district),
                       start_date=start_date.isoformat(),
                       end_date=end_date)

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.add_source(councilman['Person Name']['url'])

            # Group committee memberships into shared Organization objects
            # so each committee is yielded exactly once at the end.
            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']

                if 'committee' in committee_name.lower():
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name': 'Cook County Board of Commissioners'})
                        o.add_source(committee['Department Name']['url'])
                        committee_d[committee_name] = o

                    membership = o.add_member(p, role=committee["Title"])
                    membership.start_date = self.mdY2Ymd(committee["Start Date"])
            yield p

        for o in committee_d.values():
            yield o
Пример #17
0
    def scrape_legislator(self, chamber, name, url):
        """Fetch a legislator detail page and yield the corresponding Person."""
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        district_heading = page.xpath(
            '//h1[contains(., "DISTRICT")]/text()').pop()
        district = district_heading.split()[1].strip().lstrip('0')

        # Extract the single party letter from the parenthesized <h2> text;
        # the regex only admits R, D, or I.
        party_heading = page.xpath('//h2').pop().text_content()
        party_code = re.search(r'\((R|D|I)[ \-\]]', party_heading).group(1)
        if party_code == 'D':
            party = 'Democratic'
        elif party_code == 'R':
            party = 'Republican'
        else:
            party = 'Independent'

        photo_url = page.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib['src']

        person = Person(name,
                        district=district,
                        party=party,
                        image=photo_url,
                        primary_org=chamber)
        person.add_link(url)
        person.add_source(url)
        self.scrape_offices(person, page)

        yield person
Пример #18
0
    def scrape_member(self, chamber, link):
        """Yield a Person built from one member row anchor in the roster table."""
        name = link.text.strip()
        member_url = link.get("href")
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")

        # we get email on the next page now
        # email = link.xpath("string(../../td[5])")

        party_fixups = {"Democrat": "Democratic",
                        "No Party Specified": "Independent"}
        party = party_fixups.get(party, party)

        person_id = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" %
                     (self.latest_session(), person_id))

        member = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url,
        )
        member.add_link(member_url)
        member.add_source(member_url)

        detail_doc = lxml.html.fromstring(self.get(link.attrib["href"]).text)
        self.scrape_member_page(member, detail_doc)
        yield member
Пример #19
0
    def scrape_member(self, chamber, member_url):
        """Scrape a Kentucky member page and yield a Person with contact details."""
        doc = lxml.html.fromstring(self.get(member_url).text)

        photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
        pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
        full_name = ' '.join(pieces[1:-1]).strip()

        # The trailing token is the party in parentheses; anything else
        # is kept verbatim.
        party = pieces[-1]
        if party == '(R)':
            party = 'Republican'
        elif party == '(D)':
            party = 'Democratic'
        elif party == '(I)':
            party = 'Independent'

        district = doc.xpath(
            '//span[@id="districtHeader"]/text()')[0].split()[-1]

        person = Person(name=full_name, district=district, party=party,
                        primary_org=chamber, image=photo_url)
        person.add_source(member_url)
        person.add_link(member_url)

        address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                      'span[@class="bioText"]/text()'))

        phone, fax = None, None
        for raw in doc.xpath(
                '//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()'):
            # Only "Annex:" numbers count; a "(fax)" suffix marks the fax.
            if raw.startswith('Annex: '):
                number = raw.replace('Annex: ', '')
                if number.endswith(' (fax)'):
                    fax = number.replace(' (fax)', '')
                else:
                    phone = number

        # reduce() keeps the last address on the official domain, if any.
        emails = doc.xpath(
            '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'
        )
        email = reduce(
            lambda acc, addr: addr if '@lrc.ky.gov' in str(addr) else acc,
            emails,
            None,
        )

        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='Capitol Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='Capitol Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='Capitol Office')

        if not address.strip():
            self.warning("Missing Capitol Office!!")
        else:
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')

        yield person
Пример #20
0
    def scrape_upper_chamber(self, term):
        """Walk the senate roster table and yield a Person for each filled seat."""
        roster_url = "http://oksenate.gov/Senators/Default.aspx"
        doc = lxml.html.fromstring(self.get(roster_url).text)
        doc.make_links_absolute(roster_url)

        anchors = doc.xpath('//table[@summary]')[0].xpath(
            './/td//a[contains(@href, "biographies")]')
        for a in anchors:
            # District number: parent's tail text, or a sibling <span>.
            tail = a.xpath('..')[0].tail
            if tail:
                district = tail.split()[1]
            else:
                district = a.xpath('../../span')[1].text.split()[1]

            label = a.text
            if label is None or label.strip() == 'Vacant':
                self.warning("District {} appears to be empty".format(district))
                continue

            match = re.match(r'(.+) \(([A-Z])\)', label.strip())
            name, party = match.group(1), self._parties[match.group(2)]

            bio_url = a.get('href')

            senator = Person(primary_org='upper',
                             district=district,
                             name=name.strip(),
                             party=party,
                             )
            senator.add_link(bio_url)
            senator.add_source(bio_url)
            self.scrape_upper_offices(senator, bio_url)
            yield senator
Пример #21
0
    def bos_scrape_people(self):
        """Yield a Person for each councillor on the member-list page."""
        page = self.lxmlize(MEMBER_LIST)
        cells = page.xpath(
            "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

        for cell in cells:
            image = self.get_one(cell, ".//img")
            name_link = self.get_one(
                cell, ".//a[contains(@href, 'councillors') and (text()!='')]")
            role = cell.xpath(".//br")[0].tail.strip()
            # The list-page image is only a fallback; the homepage photo
            # (when present) replaces it below.
            image = image.attrib['src']
            homepage = name_link.attrib['href']
            name = clean_name(name_link.text)
            info = self.scrape_homepage(homepage)
            if info.get('image', None):
                image = info['image']

            p = Person(name=name, district=role, image=image,
                       primary_org="legislature", biography=info['bio'])
            p.add_link(url=homepage, note='homepage')
            p.add_source(homepage)
            p.add_source(MEMBER_LIST)
            yield p
Пример #22
0
    def scrape_upper_chamber(self, term):
        """Scrape Puerto Rico senators from senado.pr.gov, yielding Persons.

        District is taken from the seat-type label on each senator's page:
        "Senador por Distrito" members get their named district, "Senador por
        Acumulación" members are recorded as "At-Large".
        """
        url = 'https://senado.pr.gov/Pages/Senadores.aspx'

        doc = self.lxmlize(url)
        links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')

        for link in links:
            senator_page = self.lxmlize(link)
            profile_links = self.get_nodes(senator_page,
                                           '//ul[@class="profiles-links"]/li')

            name_text = self.get_node(
                senator_page, '//span[@class="name"]').text_content().strip()
            # Convert to title case as some names are in all-caps
            name = re.sub(r'^Hon\.', '', name_text,
                          flags=re.IGNORECASE).strip().title()
            party = profile_links[0].text_content().strip()
            # Translate to English since being an Independent is a universal construct
            if party == "Independiente":
                party = "Independent"

            photo_url = self.get_node(senator_page,
                                      '//div[@class="avatar"]//img/@src')

            # BUG FIX: reset district every iteration. Previously it was only
            # assigned inside the if/elif, so an unrecognized seat type either
            # raised NameError (first senator) or silently reused the previous
            # senator's district.
            district = None
            seat_type = profile_links[1].text_content().strip()
            if seat_type == "Senador por Distrito":
                district_text = self.get_node(
                    senator_page,
                    '//div[@class="module-distrito"]//span[@class="headline"]'
                ).text_content()
                district = district_text.replace('DISTRITO', '',
                                                 1).replace('\u200b',
                                                            '').strip()
            elif seat_type == "Senador por Acumulación":
                district = "At-Large"

            if district is None:
                self.warning('unknown seat type for ' + name + '; skipping')
                continue

            phone_node = self.get_node(senator_page,
                                       '//a[@class="contact-data tel"]')
            phone = phone_node.text_content().strip()
            email_node = self.get_node(senator_page,
                                       '//a[@class="contact-data email"]')
            email = email_node.text_content().replace('\u200b', '').strip()

            person = Person(primary_org='upper',
                            district=district,
                            name=name,
                            party=party,
                            image=photo_url)
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
            person.add_link(link)
            person.add_source(link)

            yield person
Пример #23
0
    def scrape_chamber(self, chamber):
        """Scrape Alaska legislators for one chamber and yield Person objects.

        :param chamber: 'upper' (senate site) or 'lower' (house site); also
            becomes each Person's primary_org.
        """
        # Site party labels -> normalized party names.
        self._party_map = {
            'Democrat': 'Democratic',
            'Republican': 'Republican',
            'Non Affiliated': 'Independent',
            'Not Affiliated': 'Independent',
        }

        if chamber == 'upper':
            url = 'http://senate.legis.state.ak.us/'
        else:
            url = 'http://house.legis.state.ak.us/'

        page = self.lxmlize(url)

        # The second <ul class="item"> on the page holds the member list.
        items = page.xpath('//ul[@class="item"]')[1].getchildren()

        for item in items:
            photo_url = item.xpath('.//img/@src')[0]
            name = item.xpath('.//strong/text()')[0]
            leg_url = item.xpath('.//a/@href')[0]
            # NOTE(review): when the "Email Me" link is absent, `email` stays
            # an empty list (not None) and is passed to _scrape_offices as-is.
            email = item.xpath('.//a[text()="Email Me"]/@href')
            if email:
                email = email[0].replace('mailto:', '')
            else:
                self.warning('no email for ' + name)

            party = district = None
            skip = False

            # Party and district are in a <dt>/<dd> definition list; a
            # "Deceased" label causes the whole member to be skipped.
            for dt in item.xpath('.//dt'):
                dd = dt.xpath('following-sibling::dd')[0].text_content()
                label = dt.text.strip()
                if label == 'Party:':
                    party = dd
                elif label == 'District:':
                    district = dd
                elif label.startswith('Deceased'):
                    skip = True
                    self.warning('skipping deceased ' + name)
                    break

            if skip:
                continue

            person = Person(
                primary_org=chamber,
                district=district,
                name=name,
                party=self._party_map[party],
                image=photo_url,
            )
            person.add_source(leg_url)
            person.add_link(leg_url)

            # scrape offices
            self._scrape_offices(person, leg_url, email)

            yield person
Пример #24
0
    def scrape_chamber(self, chamber):
        """Scrape Alaska legislators for the given chamber ('upper'/'lower')."""
        # Normalize the site's party labels.
        self._party_map = {
            'Democrat': 'Democratic',
            'Republican': 'Republican',
            'Non Affiliated': 'Independent',
            'Not Affiliated': 'Independent',
        }

        url = ('http://senate.legis.state.ak.us/'
               if chamber == 'upper'
               else 'http://house.legis.state.ak.us/')

        page = self.lxmlize(url)

        # Members live in the second <ul class="item"> element.
        members = page.xpath('//ul[@class="item"]')[1].getchildren()

        for member in members:
            photo_url = member.xpath('.//img/@src')[0]
            name = member.xpath('.//strong/text()')[0]
            leg_url = member.xpath('.//a/@href')[0]
            email = member.xpath('.//a[text()="Email Me"]/@href')
            if not email:
                self.warning('no email for ' + name)
            else:
                email = email[0].replace('mailto:', '')

            party = district = None
            skip = False

            # Walk the <dt>/<dd> definition list for party and district;
            # bail out on deceased members.
            for dt in member.xpath('.//dt'):
                dd = dt.xpath('following-sibling::dd')[0].text_content()
                label = dt.text.strip()
                if label == 'Party:':
                    party = dd
                elif label == 'District:':
                    district = dd
                elif label.startswith('Deceased'):
                    skip = True
                    self.warning('skipping deceased ' + name)
                    break

            if skip:
                continue

            person = Person(
                primary_org=chamber,
                district=district,
                name=name,
                party=self._party_map[party],
                image=photo_url,
            )
            person.add_source(leg_url)
            person.add_link(leg_url)

            # Offices come from the member's own page.
            self._scrape_offices(person, leg_url, email)

            yield person
Пример #25
0
    def scrape_chamber(self, chamber, session):
        """Scrape Nevada legislators for a chamber/session via the state API.

        :param chamber: 'upper' (Senate) or 'lower' (Assembly).
        :param session: session key mapped to a slug via the jurisdiction.
        """
        if chamber == 'upper':
            chamber_slug = 'Senate'
        elif chamber == 'lower':
            chamber_slug = 'Assembly'
        session_slug = self.jurisdiction.session_slugs[session]

        leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug,
                                                                               session_slug)
        leg_json_url = ('http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' %
                        (session_slug, chamber_slug))

        resp = json.loads(self.get(leg_json_url).text)
        for item in resp:
            # empty district
            empty_names = ['District No', 'Vacant']
            if any(name in item['FullName'] for name in empty_names):
                continue
            # API gives "Last, First"; normalize to "First Last".
            last, first = item['FullName'].split(",", 1)
            item['FullName'] = "{first} {last}".format(last=last,
                                                       first=first).strip()
            person = Person(name=item['FullName'], district=item['DistrictNbr'],
                            party=item['Party'], primary_org=chamber,
                            image=item['PhotoURL'])
            leg_url = leg_base_url + item['DistrictNbr']

            # hack to get the legislator ID out of an inline JS call on the
            # district page (last match wins; raises if no match is found)
            html = self.get(leg_url).text
            for line in html.split('\n'):
                if 'GetLegislatorDetails' in line:
                    leg_id = line.split(',')[1].split("'")[1]

            # fetch the json used by the page
            leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id='
                               .format(session_slug) + leg_id)
            leg_resp = json.loads(self.get(leg_details_url).text)
            details = leg_resp['legislatorDetails']

            address = details['Address1']
            address2 = details['Address2']
            if address2:
                address += ' ' + address2
            address += '\n%s, NV %s' % (details['City'], details['Zip'])

            phone = details['LCBPhone']
            email = details['LCBEmail']
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            # BUG FIX: this guard previously tested `phone`, so a legislator
            # with a phone but no email got an empty email contact, and one
            # with an email but no phone lost their email entirely.
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            person.add_link(leg_details_url)
            person.add_source(leg_details_url)
            yield person
Пример #26
0
def test_invalid_fields_related_item():
    """An unknown key inside a related item (a link) must be rejected on import."""
    person = ScrapePerson('Dwayne')
    person.add_link('http://example.com')
    payload = person.as_dict()
    # Inject a field the link schema does not allow.
    payload['links'][0]['test'] = 3

    with pytest.raises(DataImportError):
        PersonImporter('jid').import_data([payload])
Пример #27
0
def test_invalid_fields_related_item():
    """Importing a person whose link carries an extra, unknown field fails."""
    scraped = ScrapePerson('Dwayne')
    scraped.add_link('http://example.com')
    data = scraped.as_dict()
    data['links'][0]['test'] = 3  # not part of the link schema

    with pytest.raises(DataImportError):
        PersonImporter('jid').import_data([data])
Пример #28
0
    def scrape_chamber(self, chamber):
        """Scrape Idaho legislators for the current term of one chamber."""
        # self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        html = lxml.html.fromstring(self.get(url).text)
        html.make_links_absolute(url)

        for card in html.xpath('//div[contains(@class, "row-equal-height")]'):
            img_url = card.xpath('.//img/@src')[0]

            details = card.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
            details_text = details.text_content()
            if 'Resigned' in details_text or 'Substitute' in details_text:
                continue

            name = details.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[details.xpath('p/strong')[0].tail.strip()]
            email = details.xpath('p/strong/a')[0].text
            district = details.xpath('p/a')[0].text.replace('District ', '')

            person_url = details.xpath('p/a/@href')[0]
            # skip roles for now
            role = ''
            # for com in details.xpath('p/a[contains(@href, "committees")]'):
            #     role = com.tail.strip()

            person = Person(name=name, district=district,
                            party=party, primary_org=chamber,
                            image=img_url, role=role)

            # Add each contact detail only when a value was found; order of
            # the specs matches the order contacts are attached.
            phones = get_phones(details)
            contact_specs = [
                ('address', get_address(details), 'District Office'),
                ('voice', phones.get('home') or phones.get('business'),
                 'District Office'),
                ('fax', get_fax(details), 'District Office'),
                ('email', email, 'District Office'),
                ('voice', phones.get('office'), 'Capitol Office'),
            ]
            for ctype, value, note in contact_specs:
                if value:
                    person.add_contact_detail(type=ctype, value=value, note=note)

            person.add_source(url)
            person.add_link(person_url)
            yield person
Пример #29
0
    def scrape_chamber(self, chamber):
        """Scrape Indiana legislators for one chamber via the iga.in.gov API.

        Members whose HTML page is unavailable or lacks a district heading
        are skipped with a warning.
        """
        client = ApiClient(self)
        session = self.latest_session()
        base_url = "http://iga.in.gov/legislative"
        api_base_url = "https://api.iga.in.gov"
        chamber_name = "senate" if chamber == "upper" else "house"
        r = client.get("chamber_legislators",
                       session=session,
                       chamber=chamber_name)
        all_pages = client.unpaginate(r)
        for leg in all_pages:
            firstname = leg["firstName"]
            lastname = leg["lastName"]
            party = leg["party"]
            link = leg["link"]
            api_link = api_base_url + link
            # HTML page and portrait follow fixed path conventions derived
            # from the API link.
            html_link = base_url + link.replace("legislators/",
                                                "legislators/legislator_")
            try:
                # Site is flaky; retry with growing timeouts and give up on
                # this member if the page never loads.
                html = get_with_increasing_timeout(self,
                                                   html_link,
                                                   fail=True,
                                                   kwargs={"verify": False})
            except scrapelib.HTTPError:
                self.logger.warning("Legislator's page is not available.")
                continue
            doc = lxml.html.fromstring(html.text)
            doc.make_links_absolute(html_link)
            # Expects exactly two <address> blocks: the first is used as the
            # mailing address, the second as the phone block.
            address, phone = doc.xpath("//address")
            address = address.text_content().strip()
            address = "\n".join([l.strip() for l in address.split("\n")])
            phone = phone.text_content().strip()
            try:
                district = (doc.xpath("//span[@class='district-heading']")
                            [0].text.lower().replace("district", "").strip())
            except IndexError:
                self.warning("skipping legislator w/o district")
                continue
            image_link = base_url + link.replace("legislators/",
                                                 "portraits/legislator_")
            legislator = Person(
                primary_org=chamber,
                district=district,
                name=" ".join([firstname, lastname]),
                party=party,
                image=image_link,
            )
            legislator.add_contact_detail(type="address",
                                          note="Capitol Office",
                                          value=address)
            legislator.add_contact_detail(type="voice",
                                          note="Capitol Office",
                                          value=phone)
            legislator.add_link(html_link)
            legislator.add_source(html_link)
            legislator.add_source(api_link)

            yield legislator
Пример #30
0
    def scrape_chamber(self, chamber):
        """
        Scrapes legislators for the current term only
        """
        # self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        index = self.get(url, verify=False).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
            inner_text = inner.text_content()
            if 'Resigned' in inner_text or 'Substitute' in inner_text:
                continue

            name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            # BUG FIX: use a raw string for the regex; '\s' in a plain string
            # is an invalid escape sequence (DeprecationWarning, SyntaxWarning
            # on newer Pythons) and inconsistent with the sibling scraper.
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')

            person_url = inner.xpath('p/a/@href')[0]
            # skip roles for now
            role = ''
            # for com in inner.xpath('p/a[contains(@href, "committees")]'):
            #     role = com.tail.strip()

            person = Person(name=name, district=district,
                            party=party, primary_org=chamber,
                            image=img_url, role=role)
            phones = get_phones(inner)
            phone = phones.get('home') or phones.get('business')
            office_phone = phones.get('office')
            address = get_address(inner)
            fax = get_fax(inner)
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            if fax:
                person.add_contact_detail(type='fax', value=fax,
                                          note='District Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            if office_phone:
                person.add_contact_detail(type='voice', value=office_phone,
                                          note='Capitol Office')
            person.add_source(url)
            person.add_link(person_url)
            yield person
Пример #31
0
    def scrape_lower(self, chamber):
        """Scrape Michigan House members from the public representative list."""
        url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
        # Column order of the roster table.
        columns = [
            "website",
            "district",
            "name",
            "party",
            "location",
            "phone",
            "email",
        ]

        doc = lxml.html.fromstring(self.get(url).text)

        # skip two rows at top
        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            tds = row.xpath('.//td')
            if not tds:
                continue
            cells = {key: tds[i] for i, key in enumerate(columns)}
            district = str(int(cells['district'].text_content().strip()))
            party = cells['party'].text_content().strip()
            phone = cells['phone'].text_content().strip()
            email = cells['email'].text_content().strip()
            leg_url = cells['website'].xpath("./a")[0].attrib['href']
            name = cells['name'].text_content().strip()
            if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
                self.warning('District {} appears vacant, and will be skipped'.format(district))
                continue

            # Expand building abbreviations into full mailing addresses.
            office = cells['location'].text_content().strip()
            office = re.sub(
                ' HOB',
                ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
                office
            )
            office = re.sub(
                ' CB',
                ' State Capitol Building\nLansing, MI 48909',
                office
            )

            photo_url = self.get_photo_url(leg_url)
            person = Person(name=name, district=district, party=abbr[party],
                            primary_org='lower', image=photo_url[0] if photo_url else None)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address', value=office, note='Capitol Office')
            person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            person.add_contact_detail(type='email', value=email, note='Capitol Office')

            yield person
Пример #32
0
    def get_member(self, session, chamber, kpid):
        """Fetch one Kansas legislator from the KLISS API and yield a Person."""
        url = "%smembers/%s" % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)["content"]

        # Normalize the API's party label.
        party = content["PARTY"]
        if party == "Democrat":
            party = "Democratic"

        # Map session label -> biennium slug used in member URLs.
        biennium = {
            "2013-2014": "b2013_14",
            "2015-2016": "b2015_16",
            "2017-2018": "b2017_18",
            "2019-2020": "b2019_20",
        }[session]
        leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (biennium,
                                                                      kpid)

        try:
            profile = self.lxmlize(leg_url)
            photo_url, = profile.xpath('//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                content["FULLNAME"]))
            leg_url = ""
            photo_url = ""

        person = Person(
            name=content["FULLNAME"],
            district=str(content["DISTRICT"]),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {"occupation": content["OCCUPATION"]}

        note = "Capitol Office"
        capitol_address = "\n".join([
            "Room {}".format(content["OFFICENUM"]),
            "Kansas State Capitol Building",
            "300 SW 10th St.",
            "Topeka, KS 66612",
        ])
        person.add_contact_detail(type="address", value=capitol_address, note=note)
        person.add_contact_detail(type="email", value=content["EMAIL"], note=note)
        if content["OFFPH"]:
            person.add_contact_detail(type="voice", value=content["OFFPH"], note=note)

        person.add_source(url)
        person.add_link(leg_url)

        yield person
Пример #33
0
    def _scrape_legislator(self, row, chamber):
        """Build a Person from one roster-table row, fetching the detail page."""
        name_link = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
        name_parts = [part.strip()
                      for part in name_link.text_content().split("\n")
                      if part.strip()]
        name = " ".join(name_parts)

        party_letter = row.xpath(
            './td[@class="rosterCell partyCell"]/text()')[0].strip()
        party = {"D": "Democratic", "R": "Republican"}[party_letter]

        # Seat cell embeds the chamber abbreviation before the number.
        chamber_abbr = self._chamber_map[chamber]
        seat = row.xpath('./td[@class="rosterCell seatCell"]/text()')[0]
        district = seat.replace(chamber_abbr, "").strip()

        try:
            mailto = row.xpath('./td[@class="rosterCell emailCell"]'
                               '/a/@href')[0]
            email = mailto.replace("mailto:", "").strip()
        except IndexError:
            email = None

        phone = (row.xpath('./td[@class="rosterCell phoneCell"]'
                           '/text()')[0].strip() or None)

        details_url = "https://leg.mt.gov{}".format(name_link.attrib["href"])
        details_page = lxml.html.fromstring(self.get(details_url).text)

        # The "Address" paragraph carries the capitol address, one field per
        # line; drop the label and blank lines.
        raw_lines = (details_page.xpath(
            '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
            '/p[contains(text(), "Address")]')[0].text_content()
            .replace("Address", "").split("\n"))
        address = "\n".join(part.strip() for part in raw_lines if part.strip())

        legislator = Person(name=name,
                            district=district,
                            party=party,
                            primary_org=chamber)

        legislator.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
        if phone is not None:
            legislator.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")

        if email is not None:
            legislator.add_contact_detail(type="email",
                                          value=email,
                                          note="E-mail")

        legislator.add_link(details_url)
        legislator.add_source(self._roster_url)

        yield legislator
Пример #34
0
    def get_member(self, session, chamber, kpid):
        """Yield a Person for the Kansas legislator identified by ``kpid``."""
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        # Normalize the API's party label.
        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        # Session label -> biennium slug used in member URLs.
        session_slugs = {
            '2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18',
        }
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (
            session_slugs[session], kpid)

        try:
            bio_page = self.lxmlize(leg_url)
            (photo_url,) = bio_page.xpath(
                '//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                content['FULLNAME']))
            leg_url = ''
            photo_url = ''

        person = Person(
            name=content['FULLNAME'],
            district=str(content['DISTRICT']),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {'occupation': content['OCCUPATION']}

        note = 'Capitol Office'
        address = '\n'.join([
            'Room {}'.format(content['OFFICENUM']),
            'Kansas State Capitol Building',
            '300 SW 10th St.',
            'Topeka, KS 66612',
        ])
        person.add_contact_detail(type='address', value=address, note=note)
        person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
        if content['OFFPH']:
            person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

        person.add_source(url)
        person.add_link(leg_url)

        yield person
Пример #35
0
    def scrape_member_page(self, chamber, url):
        """Scrape all members from an Ohio chamber roster page, yielding Persons.

        Email addresses are synthesized from the district number since they
        follow a fixed per-chamber pattern.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' memberModule ')]"
                ):
            img = legislator.xpath(
                ".//div[@class='thumbnail']//img")[0].attrib['src']
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()

            if "Vacant" in full_name:
                continue

            homepage = homepage.attrib['href']
            party = data.xpath(
                ".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
            # The last bare text line is taken as the phone number; the
            # remaining lines form the office address.
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)
            office = "\n".join(office_lines)
            h3 = data.xpath("./h3")
            if len(h3):
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District", ""
                                                            ).strip()
            else:
                # No heading: recover the district number from a "<NN>.png"
                # filename in the module's style attribute.
                district = re.findall(
                    r"\d+\.png",
                    legislator.attrib['style']
                )[-1].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", full_name).strip()
            # repNN@ohiohouse.gov / sdNN@ohiosenate.gov, zero-padded to two
            # digits.
            email = (
                'rep{0:0{width}}@ohiohouse.gov'
                if chamber == 'lower' else
                'sd{0:0{width}}@ohiosenate.gov'
            ).format(int(district), width=2)

            leg = Person(name=full_name, district=district,
                         party=party, primary_org=chamber,
                         image=img)

            leg.add_contact_detail(type='address', value=office, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
Пример #36
0
    def parse_senate(self, div, chamber):
        """Build a Person from one CA senator card; return None on unknown party."""
        heading = div.xpath('.//h3/text()')[0]
        if heading.endswith(' (R)'):
            party = 'Republican'
        elif heading.endswith(' (D)'):
            party = 'Democratic'
        else:
            self.warning('skipping ' + heading)
            return None
        name = heading.split(' (')[0]

        district = div.xpath(
            './/div[contains(@class, "senator-district")]/div/text()'
        )[0].strip().lstrip('0')
        photo_url = div.xpath('.//img/@src')[0]

        person = Person(
            name=name,
            party=party,
            district=district,
            primary_org=chamber,
            image=photo_url,
        )

        person.add_link(div.xpath('.//a/@href')[0])

        # CA senators have working emails, but they're not putting them on
        # their public pages anymore
        person.add_contact_detail(type='email',
                                  value=self._construct_email(chamber, name),
                                  note='Senate Office')

        office_path = './/div[contains(@class, "{}")]//p'

        # Capitol office: a single "address; phone" paragraph.
        for block in div.xpath(
                office_path.format('views-field-field-senator-capitol-office')):
            addr_text, phone = block.text_content().split('; ')
            person.add_contact_detail(type='address', value=addr_text.strip(),
                                      note='Senate Office')
            person.add_contact_detail(type='voice', value=phone.strip(),
                                      note='Senate Office')

        # District offices: one numbered note per paragraph, one office per
        # line; lines without a "; phone" part contribute only an address.
        for office_num, block in enumerate(
                div.xpath(office_path.format(
                    'views-field-field-senator-district-office')),
                start=1):
            note = 'District Office #{}'.format(office_num)
            for line in block.text_content().strip().splitlines():
                line = line.strip().replace(u'\xa0', ' ')
                try:
                    addr_text, phone = line.split('; ')
                    person.add_contact_detail(type='address',
                                              value=addr_text.strip(),
                                              note=note)
                    person.add_contact_detail(type='voice',
                                              value=phone.strip(),
                                              note=note)
                except ValueError:
                    person.add_contact_detail(type='address',
                                              value=line.strip(),
                                              note=note)

        return person
Пример #37
0
    def scrape_senator(self, district):
        """Scrape one Maine senator's district page and yield a Person.

        :param district: district number used to build the page URL and as
            the Person's district.
        """
        link = "https://legislature.maine.gov/District-{}".format(district)
        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
        title = main.xpath("h1")[0].text
        # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
        title_match = re.match(
            r"District (\d+) - State Senator ([^\(]+) \(([DRI])", title)
        _, name, party = title_match.groups()
        name = re.sub(r"\s+", " ", name.strip())
        party = _party_map[party]

        image_url = address = phone = email = None

        # Fields appear as "Label: value" paragraphs; the first <img> found
        # is taken as the portrait.
        for p in main.xpath("p"):
            if p.xpath(".//img") and not image_url:
                image_url = p.xpath(".//img/@src")[0]
                continue
            field, _, value = p.text_content().partition(":")
            value = value.strip()
            if field in ("Address", "Mailing Address"):
                address = value
            elif field in ("Phone", "Home Phone"):
                phone = value
            elif field == "Email":
                email = value

        person = Person(
            name=name,
            district=district,
            image=image_url,
            primary_org="upper",
            party=party,
        )

        person.add_link(link)
        person.add_source(link)

        if address:
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="District Office")

        if phone:
            person.add_contact_detail(type="voice",
                                      value=clean_phone(phone),
                                      note="District Phone")
        # BUG FIX: email was previously added unconditionally, producing a
        # contact detail with value=None when the page lists no Email field;
        # guard it like address and phone.
        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Email")

        yield person
Пример #38
0
    def _scrape_legislator(self, row, chamber):
        """Build a Person from one row of the MT legislature roster table.

        ``row`` is a roster <tr> element; ``chamber`` is 'upper' or
        'lower'.  Fetches the member's detail page for the capitol
        address.  Yields a single Person.
        """
        # The name cell may wrap across lines; collapse to space-joined words.
        name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
        name = ' '.join([line.strip() for line
                         in name_cell.text_content().split('\n')
                         if len(line.strip()) > 0])

        # NOTE(review): only D/R are mapped — any other party letter would
        # raise KeyError here; confirm the roster never lists third parties.
        party_letter = row.xpath(
            './td[@class="rosterCell partyCell"]/text()')[0].strip()
        party = dict(D='Democratic', R='Republican')[party_letter]

        # Strip the chamber abbreviation prefix off the seat label to get
        # the bare district number.
        chamber_abbr = self._chamber_map[chamber]
        district = row.xpath('./td[@class="rosterCell seatCell"]'
                             '/text()')[0].replace(chamber_abbr, '').strip()
        try:
            email = row.xpath('./td[@class="rosterCell emailCell"]'
                              '/a/@href')[0].replace('mailto:', '').strip()
        except IndexError:
            # No mailto link present for this member.
            email = None

        phone = row.xpath('./td[@class="rosterCell phoneCell"]'
                          '/text()')[0].strip() or None

        # The capitol address only appears on the member's detail page.
        details_url = 'https://leg.mt.gov{}'.format(name_cell.attrib['href'])
        response = self.get(details_url)
        details_page = lxml.html.fromstring(response.text)

        address_lines = details_page.xpath(
            '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
            '/p[contains(text(), "Address")]'
            )[0].text_content() \
                .replace('Address', '') \
                .split('\n')
        address = '\n'.join([line.strip() for line in address_lines
                             if len(line.strip()) > 0])

        legislator = Person(name=name,
                            district=district,
                            party=party,
                            primary_org=chamber)

        legislator.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
        if phone is not None:
            legislator.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')

        if email is not None:
            legislator.add_contact_detail(type='email', value=email,
                                          note='E-mail')

        legislator.add_link(details_url)
        legislator.add_source(self._roster_url)

        yield legislator
Пример #39
0
    def scrape_chamber(self, chamber=None):
        """Yield a Person for every legislator listed for *chamber*.

        Entries whose detail page lacked a recognizable chamber are
        skipped with a loud console warning (historical workaround for a
        broken legislator page).
        """
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower",
                           "Senate": "upper"}[leg['chamber']]
            except KeyError:
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            person = Person(name=leg['name'], district=leg['district'],
                            party=leg['party'], primary_org=chamber,
                            image=leg['image'])

            for source in leg['source']:
                person.add_source(source)

            try:
                for ctty in leg['ctty']:
                    # Joint committees span both chambers.
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    comm = Organization(name=ctty['name'],
                                        classification="committee",
                                        chamber=ctty_chamber)
                    comm.add_member(person, role="member")

            except KeyError:
                self.log("XXX: Warning, %s has no scraped Committees" %
                         leg['name'])

            person.add_link(leg['homepage'])
            person.add_source(leg['homepage'])
            if leg['addr']:
                person.add_contact_detail(type='address', value=leg['addr'], note='Capitol Office')
            if leg['phone']:
                person.add_contact_detail(type='voice', value=leg['phone'], note='Capitol Office')
            # Bug fix: the email contact detail previously checked and used
            # leg['phone'], recording the phone number as an email address.
            if leg.get('email'):
                person.add_contact_detail(type='email', value=leg['email'], note='Capitol Office')
            yield person
Пример #40
0
    def scrape_lower_legislator(self, url, leg_info):
        """Yield a Person for one lower-chamber member detail page.

        ``leg_info`` is the roster-level dict carrying 'dist', 'office'
        and 'phone'.  Returns early (no yield) when the seat is vacant.
        """
        doc = self.lxmlize(url)

        full_name = doc.xpath(
            '//span[@id="body_FormView5_FULLNAMELabel"]/text()'
            )[0].strip()
        # Vacant seats show a placeholder instead of a member name.
        if full_name.startswith("District ") or full_name.startswith("Vacant "):
            self.warning("Seat is vacant: {}".format(full_name))
            return

        portrait = doc.xpath(
            '//img[contains(@src, "/h_reps/RepPics")]'
            )[0].attrib['src']
        raw_party = doc.xpath(
            '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
            )[0].strip()
        # Map the site's party labels onto the canonical names.
        party = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent"
        }[raw_party]
        try:
            email_addr = doc.xpath(
                '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
                )[0].strip()
        except IndexError:
            # Member has not published an email address.
            email_addr = None

        person = Person(name=full_name,
                        party=party,
                        district=leg_info['dist'].replace('Dist', '').strip(),
                        primary_org='lower',
                        image=portrait)

        # Attach only the contact details that are actually present.
        for detail_type, detail_value in (
                ("address", leg_info["office"]),
                ("voice", leg_info["phone"]),
                ("email", email_addr)):
            if detail_value:
                person.add_contact_detail(type=detail_type,
                                          value=detail_value,
                                          note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
Пример #41
0
    def scrape_chamber(self, chamber=None):
        """Yield a Person for every legislator listed for *chamber*.

        Entries whose detail page lacked a recognizable chamber are
        skipped with a loud console warning (historical workaround for
        a broken legislator page).
        """
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower",
                           "Senate": "upper"}[leg['chamber']]
            except KeyError:
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            person = Person(name=leg['name'], district=leg['district'],
                            party=leg['party'], primary_org=chamber,
                            image=leg['image'])

            for source in leg['source']:
                person.add_source(source)

            try:
                for ctty in leg['ctty']:
                    # Joint committees span both chambers.
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    # NOTE(review): the Organization is built and the member
                    # added, but it is never yielded or saved in this method —
                    # confirm committee memberships are persisted elsewhere.
                    comm = Organization(name=ctty['name'], classification="committee",
                                        chamber=ctty_chamber)
                    comm.add_member(person, role="member")

            except KeyError:
                # Legislator dict had no 'ctty' key scraped.
                self.warn("%s has no scraped Committees" % leg['name'])

            person.add_link(leg['homepage'])

            if leg['addr']:
                person.add_contact_detail(type='address', value=leg['addr'], note='Capitol Office')
            if leg['phone']:
                person.add_contact_detail(type='voice', value=leg['phone'], note='Capitol Office')
            if leg['email']:
                person.add_contact_detail(type='email', value=leg['email'], note='Capitol Office')
            if leg['fax']:
                person.add_contact_detail(type='fax', value=leg['fax'], note='Capitol Office')
            yield person
Пример #42
0
    def scrape_senator_page(self, chamber, url):
        """Yield a Person for each senator portrait on the OH roster page.

        ``url`` is the chamber roster page; each member's homepage is
        fetched for phone and address.  Email addresses are synthesized
        from the district number.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' portraitContainer ')]"):
            # The portrait URL is embedded in an inline CSS background style:
            # extract the url(...) argument.
            img = legislator.xpath(
                ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
            img = img[img.find('(') + 1:img.find(')')]
            full_name = legislator.xpath(
                ".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(
                ".//a[@class='profileImageLink']")[0].attrib['href']
            district = legislator.xpath(".//div[@class='profileDistrict']"
                                        "/a/text()")[0].split("#")[1]

            if "Vacant" in full_name:
                continue

            homepage = self.get(homepage_url).text
            page = lxml.html.fromstring(homepage)
            phone = page.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = page.xpath("//div[@class='address']/span/text()")
            address = "\n".join(address_lines)

            # Emails follow a fixed pattern keyed on the zero-padded district.
            email = ('rep{0:0{width}}@ohiohouse.gov' if chamber == 'lower' else
                     'sd{0:0{width}}@ohiosenate.gov').format(int(district),
                                                             width=2)

            # NOTE(review): no party is scraped or passed to Person here —
            # confirm whether party is attached elsewhere or genuinely missing.
            leg = Person(name=full_name,
                         district=district,
                         primary_org=chamber,
                         image=img)

            leg.add_contact_detail(type='address',
                                   value=address,
                                   note='Capitol Office')
            leg.add_contact_detail(type='voice',
                                   value=phone,
                                   note='Capitol Office')
            leg.add_contact_detail(type='email',
                                   value=email,
                                   note='Capitol Office')

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Пример #43
0
    def get_member(self, session, chamber, kpid):
        """Yield a Person built from the KS legislature API member record.

        ``kpid`` is the member's API identifier; the member's bio page is
        fetched for the portrait, falling back to empty values when that
        page is missing.
        """
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        # Normalize the API's party label.
        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        session_slugs = {
            '2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18',
            '2019-2020': 'b2019_20',
        }
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (
            session_slugs[session], kpid)

        try:
            bio_page = self.lxmlize(leg_url)
            # Tuple unpacking asserts exactly one profile picture exists.
            photo_url, = bio_page.xpath(
                '//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
            leg_url = ''
            photo_url = ''

        person = Person(
            name=content['FULLNAME'],
            district=str(content['DISTRICT']),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {'occupation': content['OCCUPATION']}

        capitol_address = '\n'.join([
            'Room {}'.format(content['OFFICENUM']),
            'Kansas State Capitol Building',
            '300 SW 10th St.',
            'Topeka, KS 66612',
        ])

        office_note = 'Capitol Office'
        person.add_contact_detail(type='address', value=capitol_address, note=office_note)
        person.add_contact_detail(type='email', value=content['EMAIL'], note=office_note)
        if content['OFFPH']:
            person.add_contact_detail(type='voice', value=content['OFFPH'], note=office_note)

        person.add_source(url)
        person.add_link(leg_url)

        yield person
Пример #44
0
    def scrape_lower_legislator(self, url, leg_info):
        """Yield a Person for one lower-chamber member detail page.

        ``leg_info`` is the roster-level dict carrying 'dist', 'office'
        and 'phone'.  Returns early (no yield) when the seat is vacant.
        """
        page = self.lxmlize(url)

        name = page.xpath(
            '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip()
        # Vacant seats show a placeholder instead of a member name.
        if name.startswith("District ") or name.startswith("Vacant "):
            self.warning("Seat is vacant: {}".format(name))
            return

        photo = page.xpath(
            '//img[contains(@src, "/h_reps/RepPics")]')[0].attrib["src"]
        # Map the site's party labels onto the canonical names.
        party_flags = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent",
        }
        party_info = page.xpath(
            '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
        )[0].strip()
        party = party_flags[party_info]
        try:
            email = page.xpath(
                '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
            )[0].strip()
        except IndexError:
            # Member has not published an email address.
            email = None
        district = leg_info["dist"].replace("Dist", "").strip()

        person = Person(name=name,
                        party=party,
                        district=district,
                        primary_org="lower",
                        image=photo)

        # Attach only the contact details that are actually present.
        contacts = [
            (leg_info["office"], "address"),
            (leg_info["phone"], "voice"),
            (email, "email"),
        ]

        for value, key in contacts:
            if value:
                person.add_contact_detail(type=key,
                                          value=value,
                                          note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
Пример #45
0
    def scrape_senator(self, district):
        """Scrape one Maine state senator's District-<n> profile page.

        Extracts name/party from the <h1> title, then portrait, address,
        phone, and email from the page's <p> elements, and yields a
        single Person.
        """
        link = "https://legislature.maine.gov/District-{}".format(district)
        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
        title = main.xpath('h1')[0].text
        # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
        title_match = re.match(
            r'District (\d+) - State Senator ([^\(]+) \(([DRI])', title)
        _, name, party = title_match.groups()
        name = re.sub(r'\s+', ' ', name.strip())
        party = _party_map[party]

        image_url = address = phone = email = None

        for p in main.xpath('p'):
            # The first image encountered is the senator's portrait.
            if p.xpath('.//img') and not image_url:
                image_url = p.xpath('.//img/@src')[0]
                continue
            field, _, value = p.text_content().partition(":")
            value = value.strip()
            if field in ('Address', 'Mailing Address'):
                address = value
            elif field in ('Phone', 'Home Phone'):
                phone = value
            elif field == 'Email':
                email = value

        person = Person(
            name=name,
            district=district,
            image=image_url,
            primary_org='upper',
            party=party,
        )

        person.add_link(link)
        person.add_source(link)

        if address:
            person.add_contact_detail(type='address', value=address, note='District Office')

        if phone:
            person.add_contact_detail(
                type='voice', value=clean_phone(phone), note='District Phone')
        # Bug fix: guard like address/phone above.  Previously this was
        # called unconditionally, passing value=None when the page had no
        # "Email:" field.
        if email:
            person.add_contact_detail(type='email', value=email, note='District Email')

        yield person
Пример #46
0
    def scrape_upper_chamber(self, term):
        """Yield a Person for each PR senator on the Senado roster."""
        url = 'https://senado.pr.gov/Pages/Senadores.aspx'

        doc = self.lxmlize(url)
        links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')

        for link in links:
            senator_page = self.lxmlize(link)
            profile_links = self.get_nodes(senator_page, '//ul[@class="profiles-links"]/li')

            name_text = self.get_node(senator_page, '//span[@class="name"]').text_content().strip()
            # Convert to title case as some names are in all-caps
            name = re.sub(r'^Hon\.', '', name_text, flags=re.IGNORECASE).strip().title()
            party = profile_links[0].text_content().strip()
            # Translate to English since being an Independent is a universal construct
            if party == "Independiente":
                party = "Independent"

            photo_url = self.get_node(senator_page, '//div[@class="avatar"]//img/@src')

            # NOTE(review): if the second profile link matches neither label
            # below, `district` is never assigned — the Person construction
            # then raises NameError (or silently reuses the previous
            # iteration's value).  Confirm every senator is district-based
            # or at-large.
            if profile_links[1].text_content().strip() == "Senador por Distrito":
                district_text = self.get_node(
                    senator_page,
                    '//div[@class="module-distrito"]//span[@class="headline"]').text_content()
                district = district_text.replace('DISTRITO', '', 1).replace('\u200b', '').strip()
            elif profile_links[1].text_content().strip() == "Senador por Acumulación":
                district = "At-Large"

            phone_node = self.get_node(senator_page, '//a[@class="contact-data tel"]')
            phone = phone_node.text_content().strip()
            email_node = self.get_node(senator_page, '//a[@class="contact-data email"]')
            # Strip the zero-width spaces the site embeds in the address.
            email = email_node.text_content().replace('\u200b', '').strip()

            person = Person(primary_org='upper',
                            district=district,
                            name=name,
                            party=party,
                            image=photo_url)
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
            person.add_link(link)
            person.add_source(link)

            yield person
Пример #47
0
    def handle_list_item(self, row):
        """Build a Person from one row of the MN Senate roster CSV.

        Returns None for rows without a first name (blank/placeholder
        rows); otherwise returns a Person with link, phone/email, and a
        capitol or district address attached.
        """
        if not row["First Name"]:
            return
        name = "{} {}".format(row["First Name"], row["Last Name"])
        party = PARTIES[row["Party"]]
        leg = Person(
            name=name,
            district=row["District"].lstrip("0"),
            party=party,
            primary_org="upper",
            role="Senator",
            image=self.extra_info[name]["image"],
        )
        leg.add_link(self.extra_info[name]["url"])
        leg.add_contact_detail(
            type="voice", value=self.extra_info[name]["office_phone"], note="capitol"
        )
        if "email" in self.extra_info[name]:
            leg.add_contact_detail(
                type="email", value=self.extra_info[name]["email"], note="capitol"
            )

        row["Zipcode"] = row["Zipcode"].strip()
        # Accommodate for multiple address column naming conventions.
        address1_fields = [row.get("Address"), row.get("Office Building")]
        address2_fields = [row.get("Address2"), row.get("Office Address")]
        row["Address"] = next((a for a in address1_fields if a is not None), False)
        row["Address2"] = next((a for a in address2_fields if a is not None), False)

        # Bug fix: this condition was a bare generator expression, which is
        # always truthy, so every row took the capitol branch and the two
        # district branches below were unreachable.  Use any(), and require
        # Address2 to be a string (it may be False) before substring tests.
        if row["Address2"] and any(
            a in row["Address2"]
            for a in ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]
        ):
            address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
            if "Rm. Number" in row:
                address = "{0} {1}".format(row["Rm. Number"], address)
            leg.add_contact_detail(type="address", value=address, note="capitol")
        elif row["Address2"]:
            address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
            leg.add_contact_detail(type="address", value=address, note="district")
        else:
            address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
            leg.add_contact_detail(type="address", value=address, note="district")

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg
Пример #48
0
    def scrape_senator_page(self, chamber, url):
        """Yield a Person for each senator portrait on the OH roster page.

        ``url`` is the chamber roster page; each member's homepage is
        fetched for phone, address, and party.  Email addresses are
        synthesized from the district number.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' portraitContainer ')]"):
            # The portrait URL is embedded in an inline CSS background style:
            # extract the url(...) argument.
            img = legislator.xpath(".//div[@class='profileThumbnailBoundingBox']/@style")[0]
            img = img[img.find('(')+1:img.find(')')]
            full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[0].attrib['href']
            district = legislator.xpath(".//div[@class='profileDistrict']"
                                        "/a/text()")[0].split("#")[1]

            if "Vacant" in full_name:
                continue

            homepage = self.get(homepage_url).text
            page = lxml.html.fromstring(homepage)
            phone = page.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = page.xpath("//div[@class='address']/span/text()")
            address = "\n".join(address_lines)

            # Party is inferred from the party-logo image filename.
            # NOTE(review): if the image matches neither label, `party` is
            # left over from the previous iteration (or NameError on the
            # first) — confirm only Republican/Democrat images appear.
            party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
            if 'Republican' in party_image:
                party = 'Republican'
            elif 'Democrat' in party_image:
                party = 'Democratic'

            # Emails follow a fixed pattern keyed on the zero-padded district.
            email = (
                'rep{0:0{width}}@ohiohouse.gov'
                if chamber == 'lower' else
                'sd{0:0{width}}@ohiosenate.gov'
            ).format(int(district), width=2)

            leg = Person(name=full_name, district=district,
                         primary_org=chamber, image=img, party=party)

            leg.add_contact_detail(type='address', value=address, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Пример #49
0
    def handle_list_item(self, row):
        """Build a Person from one row of the MN Senate roster CSV.

        Returns None for rows without a first name; otherwise returns a
        Person with link, phone/email, and a capitol or district address.
        """
        if not row['First Name']:
            return
        name = '{} {}'.format(row['First Name'], row['Last Name'])
        party = PARTIES[row['Party']]
        leg = Person(name=name, district=row['District'].lstrip('0'),
                     party=party, primary_org='upper', role='Senator',
                     image=self.extra_info[name]['image'])
        leg.add_link(self.extra_info[name]['url'])
        leg.add_contact_detail(type='voice',
                               value=self.extra_info[name]['office_phone'], note='capitol')
        if 'email' in self.extra_info[name]:
            leg.add_contact_detail(type='email',
                                   value=self.extra_info[name]['email'], note='capitol')

        row['Zipcode'] = row['Zipcode'].strip()
        # Accommodate for multiple address column naming conventions.
        address1_fields = [row.get('Address'), row.get('Office Building')]
        address2_fields = [row.get('Address2'), row.get('Office Address')]
        row['Address'] = next((a for a in address1_fields if a is not
                               None), False)
        row['Address2'] = next((a for a in address2_fields if a is not
                                None), False)

        # Bug fix: this condition was a bare generator expression, which is
        # always truthy, so every row took the capitol branch and the two
        # district branches below were unreachable.  Use any(), and require
        # Address2 to be a string (it may be False) before substring tests.
        if row['Address2'] and any(
                a in row['Address2'] for a in ['95 University Avenue W',
                                               '100 Rev. Dr. Martin Luther King']):
            address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                       .format(**row))
            if 'Rm. Number' in row:
                address = '{0} {1}'.format(row['Rm. Number'], address)
            leg.add_contact_detail(type='address', value=address,
                                   note='capitol')
        elif row['Address2']:
            address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                       .format(**row))
            leg.add_contact_detail(type='address', value=address,
                                   note='district')
        else:
            address = '{Address}\n{City}, {State} {Zipcode}'.format(**row)
            leg.add_contact_detail(type='address', value=address,
                                   note='district')

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg
Пример #50
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each Indiana legislator in *chamber*.

        Member records come from the IGA API; each member's HTML page is
        fetched for address, phone, and district, skipping members whose
        page is unavailable or lacks a district heading.
        """
        client = ApiClient(self)
        session = self.latest_session()
        base_url = "http://iga.in.gov/legislative"
        api_base_url = "https://api.iga.in.gov"
        chamber_name = "senate" if chamber == "upper" else "house"
        resp = client.get("chamber_legislators", session=session, chamber=chamber_name)
        for leg in client.unpaginate(resp):
            link = leg["link"]
            api_link = api_base_url + link
            html_link = base_url + link.replace("legislators/", "legislators/legislator_")
            try:
                html = get_with_increasing_timeout(self, html_link, fail=True,
                                                   kwargs={"verify": False})
            except scrapelib.HTTPError:
                self.logger.warning("Legislator's page is not available.")
                continue
            doc = lxml.html.fromstring(html.text)
            doc.make_links_absolute(html_link)

            # The page carries exactly two <address> elements: the mailing
            # address followed by the phone number.
            address_el, phone_el = doc.xpath("//address")
            address = "\n".join(
                part.strip()
                for part in address_el.text_content().strip().split("\n"))
            phone = phone_el.text_content().strip()

            try:
                district = doc.xpath("//span[@class='district-heading']"
                                     )[0].text.lower().replace("district", "").strip()
            except IndexError:
                self.warning("skipping legislator w/o district")
                continue

            image_link = base_url + link.replace("legislators/", "portraits/legislator_")
            legislator = Person(primary_org=chamber,
                                district=district,
                                name=" ".join([leg["firstName"], leg["lastName"]]),
                                party=leg["party"],
                                image=image_link)
            legislator.add_contact_detail(type="address", note="Capitol Office", value=address)
            legislator.add_contact_detail(type="voice", note="Capitol Office", value=phone)
            legislator.add_link(html_link)
            legislator.add_source(html_link)
            legislator.add_source(api_link)

            yield legislator
Пример #51
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each PA legislator in *chamber*.

        Emails live on a separate contacts page; each member's bio page
        is fetched for offices and photo via helper methods.
        """
        leg_list_url = utils.urls['people'][chamber]
        page = self.get(leg_list_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(leg_list_url)

        # email addresses are hidden away on a separate page now, at
        # least for Senators
        contact_url = utils.urls['contacts'][chamber]
        contact_page = self.get(contact_url).text
        contact_page = lxml.html.fromstring(contact_page)

        for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
            # Names are listed "Last, First"; flip to "First Last".
            full_name = ' '.join(link.text.split(', ')[::-1])
            full_name = re.sub(r'\s+', ' ', full_name)
            district = link.getparent().getnext().tail.strip()
            district = re.search(r"District (\d+)", district).group(1)

            # The party letter sits in the tail text following the link.
            party = link.getparent().tail.strip()[-2]
            if party == 'R':
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'

            url = link.get('href')
            leg_id = url.split('?id=')[1]

            person = Person(name=full_name, district=district, party=party,
                            primary_org=chamber)
            person.add_link(leg_list_url)
            person.add_source(leg_list_url)

            # Scrape email, offices, photo.
            page = self.get(url).text
            doc = lxml.html.fromstring(page)
            doc.make_links_absolute(url)

            email = self.scrape_email_address(contact_page, leg_id)
            self.scrape_offices(url, doc, person, email)
            self.scrape_photo_url(url, doc, person)

            yield person
Пример #52
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each Delaware legislator in *chamber*.

        Member data comes from the chamber's JSON endpoint; each member's
        detail page is fetched for the portrait image.  Vacant districts
        are skipped with a warning.
        """
        endpoints = {
            'upper': 'https://legis.delaware.gov/json/Senate/GetSenators',
            'lower': 'https://legis.delaware.gov/json/House/' +
                     'GetRepresentatives',
        }
        list_pages = {
            'upper': 'https://legis.delaware.gov/Senate',
            'lower': 'https://legis.delaware.gov/House',
        }
        source_url = list_pages[chamber]

        for item in self.post(endpoints[chamber]).json()['Data']:
            if item['PersonFullName'] is None:
                # Vacant district
                self.warning(
                    'District {} was detected as vacant'.format(
                        item['DistrictNumber']
                    )
                )
                continue

            leg_url = 'https://legis.delaware.gov/' +\
                      'LegislatorDetail?personId={}'.format(item['PersonId'])

            doc = self.lxmlize(leg_url)
            image_url = doc.xpath('//img/@src')[0]

            leg = Person(name=item['PersonFullName'],
                         district=str(item['DistrictNumber']),
                         party=PARTY[item['PartyCode']],
                         primary_org=chamber,
                         image=image_url)
            self.scrape_contact_info(leg, doc)
            leg.add_link(leg_url, note="legislator page")
            leg.add_source(source_url, note="legislator list page")
            yield leg
Пример #53
0
    def handle_list_item(self, item):
        """Yield a Person for one roster link, skipping departed members."""
        name = item.text
        lowered = name.lower()

        # Members who resigned or vacated mid-session are still listed
        # but should not be scraped.
        if 'resigned' in lowered or 'vacated' in lowered:
            return
        # Skip members who are counted under the other chamber after a move.
        if name in CHAMBER_MOVES and self.chamber != CHAMBER_MOVES[name]:
            return

        name, action, date = clean_name(name)

        href = item.get('href')
        leg = Person(name=name)
        leg.add_source(self.url)
        leg.add_source(href)
        leg.add_link(href)
        yield from self.scrape_page(
            self.detail_page,
            href,
            session=self.kwargs['session'],
            committees=self.kwargs['committees'],
            obj=leg,
        )
        yield leg
Пример #54
0
    def handle_list_item(self, item):
        """Return a Person for one FL Senate roster entry, or None if vacant."""
        raw = " ".join(item.xpath('.//text()'))
        # Collapse whitespace and tidy the comma spacing in the name.
        name = re.sub(r'\s+', " ", raw).replace(" ,", ",").strip()

        if 'Vacant' in name:
            return

        district = item.xpath("string(../../td[1])")
        party = item.xpath("string(../../td[2])")
        # Normalize the site's party label to the canonical name.
        party = 'Democratic' if party == 'Democrat' else party

        leg_url = item.get('href')

        leg = Person(name=name, district=district, party=party, primary_org='upper', role='Senator')
        leg.add_link(leg_url)
        leg.add_source(self.url)
        leg.add_source(leg_url)

        self.scrape_page(SenDetail, leg_url, obj=leg)

        return leg
Пример #55
0
    def handle_list_item(self, item):
        """Yield a Person from one row of the MN House roster table.

        Parses name/district/party from the link text, then address,
        phone, and email from adjacent text nodes.  Phone and email are
        attached only when they pass validation.
        """
        photo_url = item.xpath('./td[1]/a/img/@src')[0]
        info_nodes = item.xpath('./td[2]/p/a')
        name_text = info_nodes[0].xpath('./b/text()')[0]
        url = info_nodes[0].get('href')

        # Name text looks like "Jane Doe (01A, DFL)".
        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [x.strip() for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        ) if x.strip()]
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Bug fix: phone/email were previously left unbound when validation
        # failed, raising NameError below; initialize them to None and only
        # attach the details that validated.
        phone = None
        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email = None
        email_node = info_nodes[1]
        email_text = email_node.text
        email_text = email_text.replace('Email: ', '').strip()
        if validate_email_address(email_text):
            email = email_text

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address)
        if phone:
            rep.add_contact_detail(type='voice', value=phone)
        if email:
            rep.add_contact_detail(type='email', value=email)
        rep.add_source(self.url)

        yield rep