Example #1
    def scrape_legislators(self, chamber, year):
        if year not in self.metadata['session_details']:
            raise NoDataForYear(year)

        if chamber == 'lower':
            title = 'Representative'
        else:
            title = 'Senator'

        url = 'http://www.le.state.ut.us/asp/roster/roster.asp?year=%s' % year
        leg_list = self.soup_parser(self.urlopen(url))

        for row in leg_list.findAll('table')[1].findAll('tr')[1:]:
            tds = row.findAll('td')

            leg_title = tds[1].find(text=True)
            if leg_title == title:
                fullname = tds[0].find(text=True)
                last_name = fullname.split(',')[0]
                first_name = fullname.split(' ')[1]
                # Initialize so two-part names don't leave this unbound below
                middle_name = ''
                if len(fullname.split(' ')) > 2:
                    middle_name = fullname.split(' ')[2]

                leg = Legislator(year, chamber, tds[3].find(text=True),
                                 fullname, first_name, last_name,
                                 middle_name, tds[2].find(text=True))
                leg.add_source(url)
                self.add_legislator(leg)
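
Most of these scrapers raise NoDataForYear without showing its definition. A minimal sketch, assuming it is simply an exception that records the requested year (hypothetical; the real project may define it differently):

    # Hypothetical sketch of the NoDataForYear exception used throughout;
    # the actual project class may differ.
    class NoDataForYear(Exception):
        def __init__(self, year):
            Exception.__init__(self, "no data exists for %s" % year)
            self.year = year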
Example #2
    def scrape_legislators(self, chamber, year):
        if year != "2009":
            raise NoDataForYear(year)

        session = "%d-%d" % (int(year), int(year) + 1)

        url = "http://www.ncga.state.nc.us/gascripts/members/" "memberList.pl?sChamber="

        if chamber == "lower":
            url += "House"
        else:
            url += "Senate"

        with self.urlopen_context(url) as leg_list_data:
            leg_list = self.soup_parser(leg_list_data)
            leg_table = leg_list.find("div", id="mainBody").find("table")

            for row in leg_table.findAll("tr")[1:]:
                party = row.td.contents[0].strip()
                if party == "Dem":
                    party = "Democrat"
                elif party == "Rep":
                    party = "Republican"

                district = row.findAll("td")[1].contents[0].strip()
                full_name = row.findAll("td")[2].a.contents[0].strip()
                full_name = full_name.replace(u"\u00a0", " ")
                (first_name, last_name, middle_name, suffix) = split_name(full_name)

                legislator = Legislator(
                    session, chamber, district, full_name, first_name, last_name, middle_name, party, suffix=suffix
                )
                legislator.add_source(url)
                self.add_legislator(legislator)
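
This example leans on a split_name helper that is never shown. A naive sketch of what it might look like, assuming names arrive as "First [Middle] Last[, Suffix]" (hypothetical; the project's real helper is likely more careful):

    # Hypothetical helper, not the project's actual split_name.
    def split_name(full_name):
        name, _, suffix = full_name.partition(', ')
        parts = name.split()
        first_name = parts[0]
        last_name = parts[-1]
        middle_name = ' '.join(parts[1:-1])
        return (first_name, last_name, middle_name, suffix)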
Example #3
    def scrape_reps(self, year):
        if year != '2009':
            return

        leg_page_url = "http://www.flhouse.gov/Sections/Representatives/"\
            "representatives.aspx"
        leg_page = BeautifulSoup(self.urlopen(leg_page_url))

        table = leg_page.find('table',
                              id='ctl00_ContentPlaceHolder1_ctrlContentBox'
                                 '_ctrlPageContent_ctl00_dgLegislators')

        for row in table.findAll('tr')[1:]:
            full = row.findAll('td')[1].a.contents[0].replace('  ', ' ')
            (last, first, middle) = self.split_name(full)

            district = row.findAll('td')[3].contents[0]
            party = row.findAll('td')[2].contents[0]

            if party == 'D':
                party = 'Democrat'
            elif party == 'R':
                party = 'Republican'

            leg = Legislator(year, 'lower', district, full,
                             first, last, middle, party)
            leg.add_source(leg_page_url)
            self.add_legislator(leg)
Example #4
    def scrape_reps(self, year):
        rep_url = 'http://www.house.state.tx.us/members/welcome.php'
        with self.urlopen_context(rep_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for el in root.xpath('//form[@name="frmMembers"]/table/tr')[1:]:
                full_name = el.xpath('string(td/a/font/span)')
                district = el.xpath('string(td[2]/span)')
                county = el.xpath('string(td[3]/span)')

                if full_name.startswith('District'):
                    # Ignore empty seats
                    continue

                pre, first, last, suffixes = name_tools.split(full_name)
                party = ''

                leg = Legislator('81', 'lower', district,
                                 full_name, first, last,
                                 '', party, suffix=suffixes)
                leg.add_source(rep_url)

                # Is there anything out there that handles meta refresh?
                redirect_url = el.xpath('td/a')[0].attrib['href']
                redirect_url = ('http://www.house.state.tx.us/members/' +
                                redirect_url)
                details_url = redirect_url
                with self.urlopen_context(redirect_url) as redirect_page:
                    redirect = lxml.etree.fromstring(redirect_page,
                                                     lxml.etree.HTMLParser())

                    try:
                        filename = redirect.xpath(
                            "//meta[@http-equiv='refresh']"
                            )[0].attrib['content']

                        filename = filename.split('0;URL=')[1]

                        details_url = details_url.replace('welcome.htm',
                                                          filename)
                    except IndexError:
                        # The Speaker's member page does not redirect.
                        # The Speaker is not on any committees
                        # so we can just continue with the next member.
                        self.save_legislator(leg)
                        continue


                with self.urlopen_context(details_url) as details_page:
                    details = lxml.etree.fromstring(details_page,
                                                    lxml.etree.HTMLParser())

                    comms = details.xpath(
                        "//b[contains(text(), 'Committee Assignments')]/"
                        "..//a")
                    for comm in comms:
                        leg.add_role('committee member', '81',
                                     committee=comm.text.strip())

                self.save_legislator(leg)
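
The comment above asks whether anything handles meta refresh generically. A small sketch of such a helper, assuming the same "0;URL=..." content format the try/except block parses (hypothetical, not part of the scraper framework):

    # Hypothetical meta-refresh follower; mirrors the parsing above.
    import urlparse

    def follow_meta_refresh(doc, base_url):
        metas = doc.xpath("//meta[@http-equiv='refresh']")
        if not metas:
            return base_url  # no redirect on this page
        # content looks like "0;URL=somepage.htm"
        target = metas[0].attrib['content'].split('URL=', 1)[1]
        return urlparse.urljoin(base_url, target)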
Example #5
    def scrape_senators(self, year):
        senator_url = 'http://www.senate.state.tx.us/75r/senate/senmem.htm'
        with self.urlopen_context(senator_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for el in root.xpath('//table[@summary="senator identification"]'):
                sen_link = el.xpath('tr/td[@headers="senator"]/a')[0]
                full_name = sen_link.text
                district = el.xpath('string(tr/td[@headers="district"])')
                party = el.xpath('string(tr/td[@headers="party"])')

                pre, first, last, suffixes = name_tools.split(full_name)

                leg = Legislator('81', 'upper', district, full_name,
                                 first, last, '', party,
                                 suffix=suffixes)
                leg.add_source(senator_url)

                details_url = ('http://www.senate.state.tx.us/75r/senate/' +
                               sen_link.attrib['href'])
                with self.urlopen_context(details_url) as details_page:
                    details = lxml.etree.fromstring(details_page,
                                                    lxml.etree.HTMLParser())

                    comms = details.xpath("//h2[contains(text(), 'Committee Membership')]")[0]
                    comms = comms.getnext()
                    for comm in comms.xpath('li/a'):
                        comm_name = comm.text
                        if comm.tail:
                            comm_name += comm.tail

                        leg.add_role('committee member', '81',
                                     committee=comm_name.strip())

                self.save_legislator(leg)
Example #6
    def scrape_legislators(self, chamber, year):
        year = int(year)
        session = self.internal_sessions[year][0][1]
        # iterating through subsessions would be a better way to do this..
        if year % 2 == 0 and (year != dt.date.today().year and
                              year + 1 != dt.date.today().year):
            raise NoDataForYear(year)

        if chamber == "upper":
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
        else:
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"

        body = unicode(self.urlopen(url), "latin-1")
        page = lxml.html.fromstring(body)

        for row in page.cssselect("#ctl00_C_dgLegData tr"):
            if len(row.cssselect("td a")) > 0:
                rep_url = list(row)[0].cssselect("a[href]")[0].get("href")
                (full_name, party) = re.findall(r"([\w\-\,\s\.]+)\s+\(([\w])\)", list(row)[0].text_content())[0]

                pre, first, last, suffixes = name_tools.split(full_name)

                district = str(int(list(row)[2].text_content()))

                leg = Legislator(session, chamber, district, full_name, first, last, "", party, suffix=suffixes)
                leg.add_source(rep_url)

                leg = self.add_committees(leg, rep_url, session)
                self.save_legislator(leg)
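
The name/party regex above is dense; a quick illustration of what it extracts (illustrative input, not real scraped data):

    import re

    # Illustrative row text; real rows come from the Wisconsin member list.
    row_text = "Smith, John A. (R)"
    full_name, party = re.findall(
        r"([\w\-\,\s\.]+)\s+\(([\w])\)", row_text)[0]
    # full_name == "Smith, John A.", party == "R"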
Example #7
    def scrape_pre_2003_legislators(self, chamber, year, session, suffix):
        url = 'http://leg.mt.gov/css/Sessions/%d%s/legname.asp' % (session, suffix)
        page_data = self.parser(self.urlopen(url))
        if year == 2001:
            if chamber == 'upper':
                tableName = '57th Legislatore Roster Senate (2001-2002)'
                startRow = 3
            else:
                tableName = '57th Legislator Roster (House)(2001-2002)'
                startRow = 5
        elif year == 1999:
            if chamber == 'upper':
                tableName = 'Members of the Senate'
                startRow = 3
            else:
                tableName = 'Members of the House'
                startRow = 5
        else:
            # Only the 1999 and 2001 rosters are handled here
            raise NoDataForYear(year)
        for row in page_data.find('table', attrs={'name': tableName}).findAll('tr')[startRow:]:
            row = row.findAll('td')
            # Ignore rows with just an email address in them
            if str(row[0].contents[0]).strip() == ' ':
                continue
            # The name sits in a different spot when it is a link
            if row[0].find('a'):
                name = row[0].contents[0].next
                party_letter = name.next[2]
            else:
                if chamber == 'upper' and year == 2001:
                    name, party_letter = row[0].contents[2].rsplit(' (', 1)
                else:
                    name, party_letter = row[0].contents[0].rsplit(' (', 1)
                party_letter = party_letter[0]

            # Get first name, last name, and suffix out of the name string
            nameParts = [namePart.strip() for namePart in name.split(',')]
            assert len(nameParts) < 4
            if len(nameParts) == 2:
                # Case: last_name, first_name
                last_name, first_name = nameParts
            elif len(nameParts) == 3:
                # Case: last_name, suffix, first_name
                last_name = ' '.join(nameParts[0:2])
                first_name = nameParts[2]

            district = row[2].contents[0].strip()

            if party_letter == 'R':
                party = 'Republican'
            elif party_letter == 'D':
                party = 'Democrat'
            else:
                # Haven't yet run into others, so not sure how the state abbreviates them
                party = party_letter

            legislator = Legislator(session, chamber, district,
                                    '%s %s' % (first_name, last_name),
                                    first_name, last_name, '', party)
            legislator.add_source(url)
            self.add_legislator(legislator)
Example #8
    def scrape_legislators(self, chamber, year):
        if int(year) != 2009:
            return
        session = "%s-%d" % (year, int(year) + 1)

        # What Vermont claims are Word and Excel files are actually
        # just HTML tables
        # What Vermont claims is a CSV file is actually one row of comma
        # separated values followed by a ColdFusion error.
        leg_url = "http://www.leg.state.vt.us/legdir/"\
            "memberdata.cfm/memberdata.doc?FileType=W"
        leg_table = BeautifulSoup(self.urlopen(leg_url))

        for tr in leg_table.findAll('tr')[1:]:
            tds = tr.findAll('td')

            leg_cham = tds[3].contents[0]
            if leg_cham == 'H' and chamber == 'upper':
                continue
            if leg_cham == 'S' and chamber == 'lower':
                continue

            district = tds[5].contents[0]
            district = district.replace(' District', '').strip()
            first = tds[6].contents[0]

            middle = tds[7]
            if len(middle.contents) == 0:
                middle = ''
            else:
                middle = middle.contents[0].strip()

            last = tds[8].contents[0]

            if len(middle) == 0:
                full = "%s, %s" % (last, first)
            else:
                full = "%s, %s %s." % (last, first, middle)

            official_email = tds[9]
            if len(official_email.contents) == 0:
                official_email = ''
            else:
                official_email = official_email.contents[0]

            party = tds[4].contents[0]
            if party == 'D':
                party = 'Democrat'
            elif party == 'R':
                party = 'Republican'
            elif party == 'I':
                party = 'Independent'
            elif party == 'P':
                party = 'Progressive'

            leg = Legislator(session, chamber, district, full,
                             first, last, middle, party,
                             official_email=official_email)
            leg.add_source(leg_url)
            self.save_legislator(leg)
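
Several of these scrapers repeat the same if/elif chains to expand party abbreviations; a shared lookup would shrink them. A sketch, using only the abbreviations seen in the examples above (each state may use others):

    # Sketch of a shared party lookup; abbreviations taken from the
    # examples above, with the raw value passed through as a fallback.
    PARTIES = {
        'D': 'Democrat', 'Dem': 'Democrat',
        'R': 'Republican', 'Rep': 'Republican',
        'I': 'Independent', 'P': 'Progressive',
    }

    def expand_party(abbr):
        return PARTIES.get(abbr, abbr)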
Example #9
    def scrape_legislators(self, chamber, year):
        # Data available for 1993 on
        if int(year) < 1993 or int(year) > dt.date.today().year:
            raise NoDataForYear(year)

        # Expect first year of session (odd)
        if int(year) % 2 != 1:
            raise NoDataForYear(year)

        if chamber == 'upper':
            chamber_abbr = 'S'
        else:
            chamber_abbr = 'H'

        session = str(18 + ((int(year) - 1993) / 2))

        leg_list_url = "http://www.legis.state.ak.us/"\
            "basis/commbr_info.asp?session=%s" % session
        leg_list = self.soup_parser(self.urlopen(leg_list_url))

        leg_re = r"get_mbr_info\.asp\?member=.+&house=%s&session=%s" % (
            chamber_abbr, session)
        links = leg_list.findAll(href=re.compile(leg_re))

        for link in links:
            member_url = "http://www.legis.state.ak.us/basis/" + link['href']
            member_page = self.soup_parser(self.urlopen(member_url))

            if member_page.find('td', text=re.compile('Resigned')):
                # Need a better way to handle this than just dropping
                continue

            full_name = member_page.findAll('h3')[1].contents[0]
            full_name = ' '.join(full_name.split(' ')[1:])
            full_name = re.sub(r'\s+', ' ', full_name).strip()

            first_name = full_name.split(' ')[0]
            last_name = full_name.split(' ')[-1]
            middle_name = ' '.join(full_name.split(' ')[1:-1])

            # Member code is the three characters after 'member=' in the href
            code = link['href'][24:27]

            district = member_page.find(text=re.compile("District:"))
            district = district.strip().split(' ')[-1]

            party = member_page.find(text=re.compile("Party: "))
            party = ' '.join(party.split(' ')[1:])

            leg = Legislator(session, chamber, district,
                             full_name, first_name,
                             last_name, middle_name,
                             party, code=code)
            leg.add_source(member_url)
            self.save_legislator(leg)
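
The session arithmetic above maps first-of-biennium years onto Alaska legislature numbers; a quick worked check of the formula as written:

    # 1993 -> 18, 1995 -> 19, ..., 2009 -> 26 (Python 2 integer division)
    for y in (1993, 1995, 2009):
        print y, 18 + ((y - 1993) / 2)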
Example #10
    def fetch_member(self, url, name, session, chamber):
        abbr = {'R': 'Republican', 'D': 'Democrat', 'I': 'Independent'}
        url = "http://leg1.state.va.us/%s" % url
        with self.soup_context(url) as member:
            ex = member.findAll('table', text=re.compile(re.escape(name)))
            if ex == []:
                raise Exception("Parse error fetching member %s" % name)
            else:
                ex = ex[0].parent.nextSibling.nextSibling.string.split()

            # Some people are "Joe X. Schmoe;Resigned". Fantastic.
            name = re.split(r';|\(', name)[0]
            # some other people are Joe X. Schmoe (resigned
            name_parts = name.split()
            first_name = name_parts[0]
            last = name_parts[-1]
            if re.match(r'[IV]+$|\bJr\b\.$|\b(Sr)\b\.$', last):
                # Skip generational suffixes like "Jr." or "III"
                last_name = name_parts[-2]
            else:
                last_name = last
            
            if name_parts[1] == last_name:
                middle_name = ''
            else:
                middle_name = name_parts[1]

            # Deal with the Van Houtens of the world
            # also, watch out for their rugged Danish relatives...
            if name_parts[1] == 'Van':
                middle_name = ''
                last_name = name_parts[1] + ' ' + last_name

            last_name = last_name.replace(',', '')
            middle_name = middle_name.replace('.', '')
            party = ex[0][1]
            district = ex[-1]

            leg = Legislator(session=session, chamber=chamber,
                             district=district, full_name=name.strip(),
                             first_name=first_name.strip(),
                             last_name=last_name.strip(),
                             middle_name=middle_name.strip(),
                             party=abbr[party])
            leg.add_source(url)
            # [_,_,district,_]
            # so... yeah. not totally sure how I should handle legislators in subsessions
            # but I'll only add them if the matcher doesn't already know about them.
            sanitized = leg['full_name'].replace('.', '').lower()
            if self.matcher[chamber][sanitized] and self.matcher[chamber][sanitized][2] == district:
                return
            self.save_legislator(leg)
Example #11
    def scrape_new_legislators(self, chamber, session):
        """
        Scrape legislators from 2009 and later.
        """

        if chamber == 'upper':
            search = 'Senate Members'
        else:
            search = 'House Members'

        leg_list_url = "http://legis.state.sd.us/sessions/%s/"\
            "MemberMenu.aspx" % (session)
        leg_list = self.soup_parser(self.urlopen(leg_list_url))

        list_div = leg_list.find(text=search).findNext('div')

        for link in list_div.findAll('a'):
            full_name = link.contents[0].strip()
            first_name = full_name.split(', ')[1].split(' ')[0]
            last_name = full_name.split(',')[0]
            middle_name = ''

            leg_page_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                session, link['href'])
            leg_page = self.soup_parser(self.urlopen(leg_page_url))

            party = leg_page.find(
                id="ctl00_contentMain_spanParty").contents[0].strip()

            district = leg_page.find(
                id="ctl00_contentMain_spanDistrict").contents[0]
            district = district.strip().lstrip('0')

            occ_span = leg_page.find(id="ctl00_contentMain_spanOccupation")
            if len(occ_span.contents) > 0:
                occupation = occ_span.contents[0].strip()
            else:
                occupation = None

            legislator = Legislator(session, chamber, district,
                                    full_name, first_name, last_name,
                                    middle_name, party,
                                    occupation=occupation)
            legislator.add_source(leg_page_url)
            self.save_legislator(legislator)
Example #12
    def parse_legislator(self, chamber, year, full_name, district, url):
        with self.soup_context(url) as leg_page:
            name_str = leg_page.find("strong").contents[0].strip()

            if name_str.endswith("(D)"):
                party = "Democrat"
            elif name_str.endswith("(R)"):
                party = "Republican"
            elif name_str.endswith("(I)"):
                party = "Independent"
            else:
                party = "Other"

            full_name = full_name.replace("\n", "").replace("&quot;", '"')
            full_name = full_name.replace("\t", "").replace("\r", "")
            (first_name, last_name, middle_name) = split_name(full_name)

            legislator = Legislator(year, chamber, district, full_name, first_name, last_name, middle_name, party)
            legislator.add_source(url)

            self.save_legislator(legislator)
Example #13
    def parse_legislator(self, chamber, year, full_name, district, url):
        with self.soup_context(url) as leg_page:
            name_str = leg_page.find('strong').contents[0].strip()

            if name_str.endswith('(D)'):
                party = 'Democrat'
            elif name_str.endswith('(R)'):
                party = 'Republican'
            elif name_str.endswith('(I)'):
                party = 'Independent'
            else:
                party = 'Other'

            full_name = full_name.replace('\n', '').replace('&quot;', '"')
            full_name = full_name.replace('\t', '').replace('\r', '')
            (first_name, last_name, middle_name) = split_name(full_name)

            legislator = Legislator(year, chamber, district, full_name,
                                    first_name, last_name, middle_name, party)
            legislator.add_source(url)

            self.add_legislator(legislator)
Example #14
    def scrape_senators(self, year):
        if year != '2009':
            return

        leg_page_url = "http://www.flsenate.gov/Legislators/"\
            "index.cfm?Mode=Member%20Pages&Submenu=1&Tab=legislators"
        leg_page = BeautifulSoup(self.urlopen(leg_page_url))

        th = leg_page.find('th', text='Legislator').parent
        table = th.parent.parent

        for row in table.findAll('tr')[1:]:
            full = row.td.a.contents[0].replace('  ', ' ')
            (last, first, middle) = self.split_name(full)

            district = row.findAll('td')[1].contents[0]
            party = row.findAll('td')[2].contents[0]

            leg = Legislator(year, 'upper', district, full,
                             first, last, middle, party)
            leg.add_source(leg_page_url)
            self.add_legislator(leg)
Example #15
    def scrape_old_legislators(self, chamber, session):
        """
        Scrape pre-2009 legislators.
        """
        if chamber == 'upper':
            chamber_name = 'Senate'
        else:
            chamber_name = 'House'

        if int(session) < 2008:
            filename = 'district.htm'
        else:
            filename = 'MembersDistrict.htm'

        leg_list_url = "http://legis.state.sd.us/sessions/%s/%s" % (
            session, filename)
        leg_list = self.soup_parser(self.urlopen(leg_list_url))

        for district_str in leg_list.findAll('h2'):
            district = district_str.contents[0].split(' ')[1].lstrip('0')

            for row in district_str.findNext('table').findAll('tr')[1:]:
                if row.findAll('td')[1].contents[0].strip() != chamber_name:
                    continue

                full_name = row.td.a.contents[0].strip()
                first_name = full_name.split(', ')[1].split(' ')[0]
                last_name = full_name.split(',')[0]
                middle_name = ''

                party = row.findAll('td')[3].contents[0].strip()
                occupation = row.findAll('td')[4].contents[0].strip()

                legislator = Legislator(session, chamber, district,
                                        full_name, first_name, last_name,
                                        middle_name, party=party,
                                        occupation=occupation)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)
Example #16
    def scrape_post_2003_legislators(self, chamber, year, session, suffix):
        chamber_file = 'Senate' if chamber == 'upper' else 'House'
        url = 'http://leg.mt.gov/content/sessions/%d%s/%d%sMembers.txt' % (
            session, suffix, year, chamber_file)

        # Currently 2009 is different: tab-delimited with a title row
        if year > 2008:
            csv_parser = csv.reader(self.urlopen(url).split(os.linesep),
                                    delimiter='\t')
            # Discard title row
            csv_parser.next()
        else:
            csv_parser = csv.reader(self.urlopen(url).split(os.linesep))

        for entry in csv_parser:
            if not entry:
                continue
            if year == 2003:
                first_name, last_name = entry[0].split(' ', 2)[1:3]
                party_letter = entry[1]
                district = entry[2]
            else:
                last_name = entry[0]
                first_name = entry[1]
                party_letter = entry[2]
                district = entry[3]
            if party_letter == '(R)':
                party = 'Republican'
            elif party_letter == '(D)':
                party = 'Democrat'
            else:
                party = party_letter
            first_name = first_name.capitalize()
            last_name = last_name.capitalize()
            # All we care about is the district number
            district = district.split('D ')[1]

            legislator = Legislator(session, chamber, district,
                                    '%s %s' % (first_name, last_name),
                                    first_name, last_name, '', party)
            legislator.add_source(url)
            self.add_legislator(legislator)
Example #17
    def scrape_legislators(self, chamber, year):
        # Pennsylvania doesn't make member lists easily available
        # for previous sessions, unfortunately
        if int(year) < 2009:
            #raise NoDataForYear(year)
            return

        session = "%s-%d" % (year, int(year) + 1)
        leg_list_url = legislators_url(chamber)

        with self.soup_context(leg_list_url) as member_list_page:
            for link in member_list_page.findAll(
                    'a', href=re.compile(r'_bio\.cfm\?id=')):

                full_name = link.contents[0][0:-4]
                last_name = full_name.split(',')[0]
                first_name = full_name.split(' ')[1]

                if len(full_name.split(' ')) > 2:
                    middle_name = full_name.split(' ')[2].strip(',')
                else:
                    middle_name = ''

                party = link.contents[0][-2]
                if party == 'R':
                    party = "Republican"
                elif party == 'D':
                    party = "Democrat"

                district = re.search(
                    r"District (\d+)", link.parent.contents[1]).group(1)

                legislator = Legislator(session, chamber, district,
                                        full_name, first_name, last_name,
                                        middle_name, party)
                legislator.add_source(leg_list_url)
                self.add_legislator(legislator)
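
The [0:-4] slice and [-2] index above assume every link's text ends with a four-character party tag; a quick illustration with made-up text:

    # Illustrative only; real link text comes from the PA member list.
    text = 'Smith, John (R)'
    full_name = text[0:-4]   # 'Smith, John'
    party = text[-2]         # 'R'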
Example #18
    def scrape_legislators(self, chamber, year):
        if year != '2009':
            raise NoDataForYear(year)

        session = "%d-%d" % (int(year), int(year) + 1)

        url = "http://www.ncga.state.nc.us/gascripts/members/"\
            "memberList.pl?sChamber="

        if chamber == 'lower':
            url += 'House'
        else:
            url += 'Senate'

        with self.urlopen_context(url) as leg_list_data:
            leg_list = self.soup_parser(leg_list_data)
            leg_table = leg_list.find('div', id='mainBody').find('table')

            for row in leg_table.findAll('tr')[1:]:
                party = row.td.contents[0].strip()
                if party == 'Dem':
                    party = 'Democrat'
                elif party == 'Rep':
                    party = 'Republican'

                district = row.findAll('td')[1].contents[0].strip()
                full_name = row.findAll('td')[2].a.contents[0].strip()
                full_name = full_name.replace(u'\u00a0', ' ')
                (first_name, last_name, middle_name, suffix) = split_name(
                    full_name)

                legislator = Legislator(session, chamber, district, full_name,
                                        first_name, last_name, middle_name,
                                        party, suffix=suffix)
                legislator.add_source(url)
                self.save_legislator(legislator)
Example #19
    def scrape_legislators(self, chamber, year):
        if year != "2009":
            raise NoDataForYear(year)

        l1 = Legislator("2009-2010", chamber, "1st", "Bob Smith", "Bob", "Smith", "", "Democrat")

        if chamber == "upper":
            l1.add_role("President of the Senate", "2009-2010")
        else:
            l1.add_role("Speaker of the House", "2009-2010")

        l1.add_source("http://example.com/Bob_Smith.html")

        l2 = Legislator("2009-2010", chamber, "2nd", "Sally Johnson", "Sally", "Johnson", "", "Republican")
        l2.add_role("Minority Leader", "2009-2010")
        l2.add_source("http://example.com/Sally_Johnson.html")

        self.save_legislator(l1)
        self.save_legislator(l2)
Example #20
    def scrape_legislators(self, chamber, year):
        """
        Scrape the ND legislators seated in a given chamber during a given year.
        """    
        # Error checking
        if year not in self.metadata['session_details']:
            raise NoDataForYear(year)
        
        # No legislator data for 1997 (though other data is available)
        if year == '1997':
            raise NoDataForYear(year)
        
        # URL building
        if chamber == 'upper':
            url_chamber_name = 'senate'
            norm_chamber_name = 'Senate'
            url_member_name = 'senators'
        else:
            url_chamber_name = 'house'
            norm_chamber_name = 'House'
            url_member_name = 'representatives'
        
        assembly_url = '/assembly/%i-%s/%s' % (
            self.metadata['session_details'][str(year)]['number'],
            year,
            url_chamber_name)
        
        list_url = (self.site_root + assembly_url +
                    '/members/last-name.html')
        
        # Parsing
        soup = self.parser.parse(self.urlopen(list_url))
        
        if not soup:
            raise ScrapeError('Failed to parse legislative list page.')
        
        header = soup.find('h2')
        
        if not header:
            raise ScrapeError('Legislative list header element not found.')
        
        party_images = {'/images/donkey.gif': 'Democrat',
                        '/images/elephant.gif': 'Republican'}
        for row in header.findNextSibling('table').findAll('tr'):
            cells = row.findAll('td')
            party = party_images[cells[0].img['src']]
            name = [part.strip()
                    for part in cells[1].a.contents[0].split(', ')]
            name.reverse()
            name = ' '.join(name)
            district = re.findall(r'District (\d+)', cells[2].contents[0])[0]
            attributes = {
                'session': year,
                'chamber': chamber,
                'district': district,
                'party': party,
                'full_name': name,
            }
            split_name = name.split(' ')
            if len(split_name) > 2:
                attributes['first_name'] = split_name[0]
                attributes['middle_name'] = split_name[1].strip(' .')
                attributes['last_name'] = split_name[2]
            else:
                attributes['first_name'] = split_name[0]
                attributes['middle_name'] = u''
                attributes['last_name'] = split_name[1]

            # we can get some more data..
            bio_url = self.site_root + cells[1].a['href']
            try:
                attributes.update(self.scrape_legislator_bio(bio_url))
            except urllib2.HTTPError: 
                self.log("failed to fetch %s" % bio_url)

            self.debug("attributes: %d", len(attributes))
            self.debug(attributes)
            # Save
            legislator = Legislator(**attributes)
            legislator.add_source(bio_url)
            self.save_legislator(legislator)
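
The split/reverse/join dance above turns "Last, First" into "First Last"; a compact illustration with a made-up name:

    # Illustrative only; real names come from the ND member table.
    name = [part.strip() for part in 'Smith, John Q.'.split(', ')]
    name.reverse()
    print ' '.join(name)   # "John Q. Smith"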