Example #1
    def scrape(self, chamber, term_name):
        year = term_name[0:4]
        if int(year) < 2001:
            raise NoDataForPeriod(year)

        if ((int(year) - 2010) % 2) == 1:
            session = ((int(year) - 2010) // 2) + 76
        elif ((int(year) - 2010) % 2) == 0 and int(year) >= 2010:
            session = ((int(year) - 2010) // 2) + 26
        else:
            raise NoDataForPeriod(term_name)

        self.scrape_legislators(chamber, session, year, term_name)
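
The session arithmetic above maps odd years to a 76-based session series and even years from 2010 onward to a 26-based one. A minimal standalone sketch of the mapping (the helper name is hypothetical; the constants come straight from the example):

    def term_year_to_session(year):
        # Odd years fall in the 76-based series; Python's floor modulo
        # routes odd years before 2010 here as well.
        if (year - 2010) % 2 == 1:
            return (year - 2010) // 2 + 76
        # Even years from 2010 onward fall in the 26-based series.
        if year >= 2010:
            return (year - 2010) // 2 + 26
        raise ValueError(year)

    # term_year_to_session(2011) == 76, term_year_to_session(2013) == 77
    # term_year_to_session(2010) == 26, term_year_to_session(2012) == 27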
Example #2
    def scrape(self, chamber, year):
        if year not in metadata['sessions']:
            raise NoDataForPeriod(year)

        self.scrape_session(chamber, year)
        for sub in metadata['session_details'][year]['sub_sessions']:
            self.scrape_session(chamber, sub)
Example #3
    def scrape(self, chamber, session):
        if session != '2011':
            raise NoDataForPeriod(session)

        # start by building subject map
        self.scrape_subjects(chamber, session)

        url = "http://webserver1.lsb.state.ok.us/WebApplication3/WebForm1.aspx"
        form_page = lxml.html.fromstring(self.urlopen(url))

        if chamber == 'upper':
            chamber_letter = 'S'
        else:
            chamber_letter = 'H'

        values = [('cbxSessionId', self.session_id_map[session]),
                  ('cbxActiveStatus', 'All'),
                  ('RadioButtonList1', 'On Any day'),
                  ('Button1', 'Retrieve')]

        for bill_type in self.bill_types:
            values.append(('lbxTypes', chamber_letter + bill_type))

        for hidden in form_page.xpath("//input[@type='hidden']"):
            values.append((hidden.attrib['name'], hidden.attrib['value']))

        page = self.urlopen(url, "POST", urllib.urlencode(values))
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'BillInfo')]"):
            bill_id = link.text.strip()
            self.scrape_bill(chamber, session, bill_id, link.attrib['href'])
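
Example #3 follows the usual recipe for scraping ASP.NET search forms: load the form page, echo every hidden input (__VIEWSTATE, __EVENTVALIDATION, and friends) back in the POST body alongside the visible field values, and submit. A minimal standalone sketch of that round-trip, assuming the requests library in place of the scraper's own urlopen:

    import lxml.html
    import requests

    url = "http://webserver1.lsb.state.ok.us/WebApplication3/WebForm1.aspx"
    form_page = lxml.html.fromstring(requests.get(url).text)

    values = [('cbxActiveStatus', 'All'),
              ('RadioButtonList1', 'On Any day'),
              ('Button1', 'Retrieve')]
    # ASP.NET rejects postbacks that omit its hidden state fields, so
    # copy every hidden input back into the POST body.
    for hidden in form_page.xpath("//input[@type='hidden']"):
        values.append((hidden.get('name'), hidden.get('value')))

    result = lxml.html.fromstring(requests.post(url, data=values).text)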
Example #4
    def _scrape_upper_chamber(self, year):
        # We only have data back to 2005.
        if int(year) < 2005:
            raise NoDataForPeriod(year)

        self.info('Scraping bills from upper chamber.')

        year2 = "%02d" % (int(year) % 100)

        # Save the root URL, since we'll use it later.
        bill_root = 'http://www.senate.mo.gov/{}info/BTS_Web/'.format(year2)
        index_url = bill_root + 'BillList.aspx?SessionType=R'

        index_page = self.get(index_url).text
        index_page = lxml.html.fromstring(index_page)
        # Each bill is in its own table (nested within a larger table).
        bill_tables = index_page.xpath('//a[@id]')

        if not bill_tables:
            return

        for bill_table in bill_tables:
            # Search the anchor's id attribute to find the BillID that
            # the MO Senate site uses.
            if re.search(r'dgBillList.*hlBillNum', bill_table.attrib['id']):
                self._parse_senate_billpage(
                    bill_root + bill_table.attrib.get('href'), year)
Example #5
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            chamber_abbrev = 'sen'
            title_abbrev = 'sen'
        else:
            chamber_abbrev = 'hse'
            title_abbrev = 'del'

        url = "http://www.legis.state.wv.us/districts/maps/%s_dist.cfm" % (
            chamber_abbrev)
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        view_url = '%smemview' % title_abbrev
        for link in page.xpath("//a[contains(@href, '%s')]" % view_url):
            name = link.xpath("string()").strip()
            leg_url = urlescape(link.attrib['href'])

            if name in [
                    'Members', 'Senate Members', 'House Members', 'Vacancy'
            ]:
                continue

            self.scrape_legislator(chamber, term, name, leg_url)
Example #6
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

        url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % chamber_abbr
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
                comm_name = comm_link.text.strip()

                # Drop leading "House" or "Senate" from name
                comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

                comm = Committee(chamber, comm_name)

                for mbr_link in comm_link.xpath(
                        "../../../font[2]/a[not(contains(@href, 'mailto'))]"):

                    name = mbr_link.text.strip()

                    next_el = mbr_link.getnext()
                    if next_el is not None and next_el.tag == 'i':
                        role = next_el.text.strip()
                    else:
                        role = 'member'

                    comm.add_member(name, role)

                self.save_committee(comm)
Example #7
    def scrape(self, chamber, term):
        # Pennsylvania doesn't make member lists easily available
        # for previous sessions, unfortunately
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        leg_list_url = legislators_url(chamber)

        with self.urlopen(leg_list_url) as page:
            page = lxml.html.fromstring(page)

            for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
                full_name = link.text[0:-4]
                district = re.search(r"District (\d+)", link.tail).group(1)

                party = link.text[-2]
                if party == 'R':
                    party = 'Republican'
                elif party == 'D':
                    party = 'Democratic'

                legislator = Legislator(term, chamber, district,
                                        full_name, party=party)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)
Example #8
    def scrape_house(self, year):
        if int(year) < 2000 or int(year) > dt.date.today().year:
            raise NoDataForPeriod(year)

        bill_page_url = ('%s/BillList.aspx?year=%s' %
                         (self.senate_base_url, year))
        self.parse_house_billpage(bill_page_url, year)
Example #9
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        chamber_name = {'upper': 'Senate', 'lower': 'House'}[chamber]

        url = ("http://www.in.gov/cgi-bin/legislative/listing/"
               "listing-2.pl?data=alpha&chamber=%s" % chamber_name)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for link in page.xpath("//div[@id='col2']/p/a"):
                name = link.text.strip()

                details = link.getnext().text.strip()

                party = details.split(',')[0]
                if party == 'Democrat':
                    party = 'Democratic'

                district = re.search(r'District (\d+)', details).group(1)
                district = district.lstrip('0')

                leg = Legislator(term, chamber, district, name, '', '', '',
                                 party)
                leg.add_source(url)

                self.save_legislator(leg)
Example #10
    def scrape(self, chamber, term):
        if term != '27':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            chamber_abbr = 'S'
            url = 'http://senate.legis.state.ak.us/'
            search = 'senator'
        else:
            chamber_abbr = 'H'
            url = 'http://house.legis.state.ak.us/'
            search = 'rep'

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            seen = set()
            for link in page.xpath("//a[contains(@href, '%s')]" % search):
                name = link.text

                # Members of the leadership can be linked three times:
                # one image link and two text links. Don't scrape the
                # same member more than once.
                if not name or link.attrib['href'] in seen:
                    continue
                seen.add(link.attrib['href'])

                self.scrape_legislator(chamber, term,
                                       link.xpath('string()').strip(),
                                       link.attrib['href'])
Example #11
    def scrape(self, chamber, session):
        if session != '2011 Regular Session':
            raise NoDataForPeriod(session)

        url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
                date = div.xpath("string(../../span[1])").strip()

                try:
                    time, location = div.xpath("string(span[1])").split(',')
                except ValueError:
                    # No meetings
                    continue

                when = "%s %s" % (date, time)
                when = datetime.datetime.strptime(when,
                                                  "%A, %B %d, %Y %I:%M%p")
                when = self._tz.localize(when)

                desc = div.xpath("string(span[2])").strip()
                event = Event(session,
                              when,
                              'committee:meeting',
                              desc,
                              location=location)
                event.add_source(url)

                self.save_event(event)
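
The date handling in Example #11 is the standard pytz idiom: strptime yields a naive datetime, and tz.localize() attaches the zone afterward (constructing the datetime with tzinfo= gives wrong offsets with pytz zones). A short sketch, assuming an Eastern-time _tz such as Kentucky's:

    import datetime
    import pytz

    tz = pytz.timezone('America/New_York')  # assumed stand-in for self._tz
    when = datetime.datetime.strptime(
        "Tuesday, February 1, 2011 1:00PM", "%A, %B %d, %Y %I:%M%p")
    when = tz.localize(when)  # picks the correct EST/EDT offset for the date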
Example #12
    def scrape(self, chamber, year):
        year = int(year)
        session = self.getSession(year)
        if year < 1999:
            raise NoDataForPeriod(year)
        # Terms run two years starting on odd years, so for an even
        # year use the preceding odd year.
        if year % 2 == 0:
            year -= 1

        if year == 1999:
            base_bill_url = 'http://data.opi.mt.gov/bills/BillHtml/'
        else:
            base_bill_url = 'http://data.opi.mt.gov/bills/%d/BillHtml/' % year
        index_page = ElementTree(
            lxml.html.fromstring(self.urlopen(base_bill_url)))

        bill_urls = []
        for bill_anchor in index_page.findall('//a'):
            # See 2009 HB 645
            if bill_anchor.text.find("govlineveto") == -1:
                # House bills start with H, Senate bills start with S
                if chamber == 'lower' and bill_anchor.text.startswith('H'):
                    bill_urls.append("%s%s" %
                                     (base_bill_url, bill_anchor.text))
                elif chamber == 'upper' and bill_anchor.text.startswith('S'):
                    bill_urls.append("%s%s" %
                                     (base_bill_url, bill_anchor.text))

        for bill_url in bill_urls:
            bill = self.parse_bill(bill_url, session, chamber)
            self.save_bill(bill)
Example #13
    def scrape_senate(self, year):
        # We only have data from 2005-present
        if int(year) < 2005 or int(year) > dt.date.today().year:
            raise NoDataForPeriod(year)

        year2 = "%02d" % (int(year) % 100)

        # year is mixed in to the directory. set a root_url, since
        # we'll use it later
        bill_root = 'http://www.senate.mo.gov/%sinfo/BTS_Web/' % year2
        index_url = bill_root + 'BillList.aspx?SessionType=R'

        with self.urlopen(index_url) as index_page:
            index_page = lxml.html.fromstring(index_page)
            # each bill is in its own table (nested in a larger table)
            bill_tables = index_page.xpath('//a[@id]')

            if not bill_tables:
                return

            for bill_table in bill_tables:
                # search the anchor's id attribute to find the BillID
                # that the MO Senate site uses
                if re.search(r'dgBillList.*hlBillNum', bill_table.attrib['id']):
                    self.parse_senate_billpage(
                        bill_root + bill_table.attrib.get('href'), year)
Example #14
    def scrape_senate(self, year):
        # We only have data from 2005 to the present
        if int(year) < 2005 or int(year) > dt.date.today().year:
            raise NoDataForPeriod(year)

        year2 = "%02d" % (int(year) % 100)

        # year is mixed in to the directory. set a root_url, since
        # we'll use it later
        bill_root = self.senate_root + '/' + year2 + 'info/BTS_Web/'
        index_url = bill_root + 'BillList.aspx?SessionType=R'

        with self.urlopen(index_url) as index_page:
            index_page = BeautifulSoup(index_page)
            # each bill is in its own table (nested in a larger table)
            bill_tables = index_page.findAll(id="Table2")

            if not bill_tables:
                return

            for bill_table in bill_tables:
                # here we just search the whole table string to get
                # the BillID that the MO senate site uses
                m = re.search(r"BillID=(\d*)", str(bill_table))
                if m:
                    bill_web_id = m.group(1)
                    bill_url = (bill_root +
                                'Bill.aspx?SessionType=R&BillID=' +
                                bill_web_id)

                    self.parse_senate_billpage(bill_url, year)
Example #15
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/senators_ca.cfm')
        else:
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/representatives_ca.cfm')

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            committees = {}

            for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
                name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
                name = name[0:-4]

                for link in li.xpath("a"):
                    if not link.tail:
                        continue

                    committee_name = link.tail.strip()
                    committee_name = re.sub(r"\s+", " ", committee_name)
                    subcommittee_name = None
                    role = 'member'

                    rest = link.getnext().text
                    if rest:
                        match = re.match(r',\s+(Subcommittee on .*)\s+-', rest)

                        if match:
                            subcommittee_name = match.group(1)
                            role = rest.split('-')[1].strip().lower()
                        else:
                            role = rest.replace(', ', '').strip().lower()

                        if role == 'chairman':
                            role = 'chair'

                    try:
                        committee = committees[(chamber, committee_name,
                                                subcommittee_name)]
                    except KeyError:
                        committee = Committee(chamber, committee_name)
                        committee.add_source(url)

                        if subcommittee_name:
                            committee['subcommittee'] = subcommittee_name

                        committees[(chamber, committee_name,
                                    subcommittee_name)] = committee

                    committee.add_member(name, role)

            for committee in committees.values():
                self.save_committee(committee)
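
The try/except KeyError around the committees dict in Example #15 is a build-once cache: a committee is created the first time a (chamber, committee, subcommittee) key appears and reused for every later member, so each one is saved exactly once at the end. A standalone sketch of the pattern, with plain dicts standing in for the Committee class:

    committees = {}

    def get_committee(chamber, name, subcommittee=None):
        key = (chamber, name, subcommittee)
        try:
            return committees[key]
        except KeyError:
            # First sighting: build the committee and remember it.
            comm = {'chamber': chamber, 'name': name, 'members': []}
            if subcommittee:
                comm['subcommittee'] = subcommittee
            committees[key] = comm
            return comm

    get_committee('upper', 'Appropriations')['members'].append(('Jane Doe', 'chair'))
    get_committee('upper', 'Appropriations')['members'].append(('John Roe', 'member'))
    # len(committees) == 1; both members landed on the same committee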
Example #16
    def scrape(self, chamber, session):
        if session != '27':
            raise NoDataForPeriod(session)

        if chamber == 'other':
            return

        year, year2 = None, None
        for term in self.metadata['terms']:
            if term['sessions'][0] == session:
                year = str(term['start_year'])
                year2 = str(term['end_year'])
                break

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year[2:]

        url = ("http://www.legis.state.ak.us/basis/"
               "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
               "Comty=&Root=&Sel=1&Button=Display" % (session, date1, date2))

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
            for font in page.xpath(path):
                match = re.match(r'^\((H|S)\)(.+)$', font.text)

                chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
                comm = match.group(2).strip().title()

                next_row = font.xpath("../../following-sibling::tr[1]")[0]

                when = next_row.xpath("string(td[1]/font)").strip()
                when = datetime.datetime.strptime(when + " " + year,
                                                  "%b %d  %A %I:%M %p %Y")
                when = self._tz.localize(when)

                where = next_row.xpath("string(td[2]/font)").strip()

                description = "Committee Meeting\n"
                description += comm

                links = font.xpath(
                    "../../td/font/a[contains(@href, 'get_documents')]")

                event = Event(session,
                              when,
                              'committee:meeting',
                              description,
                              location=where)
                # The Event must exist before the agenda link can be
                # attached to it.
                if links:
                    event['link'] = links[0].attrib['href']
                event.add_source(url)
                self.save_event(event)
Example #17
    def scrape(self, chamber, term):
        if term != self.metadata['terms'][-1]['name']:
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            self.scrape_senate()
        else:
            self.scrape_house()
Example #18
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == "upper":
            self.scrape_upper()
        elif chamber == "lower":
            self.scrape_lower()
Example #19
    def scrape(self, chamber, term):
        if term != '2009-2010':
            raise NoDataForPeriod(term)

        if chamber == "upper":
            self.scrape_senate()
        elif chamber == "lower":
            self.scrape_assembly()
Example #20
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            self.scrape_senators(chamber, term)
        else:
            self.scrape_reps(chamber, term)
Example #21
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'lower':
            self.scrape_lower(term)
        else:
            self.scrape_upper(term)
Example #22
    def scrape(self, chamber, session):
        if session != '2010':
            raise NoDataForPeriod(session)

        if chamber == 'lower':
            self.scrape_house_weekly_schedule(session)

        self.scrape_committee_schedule(session, chamber)
Example #23
    def scrape(self, chamber, term):
        if term != '20112012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            self.scrape_upper_committees(term)
        else:
            self.scrape_lower_committees(term)
Example #24
    def scrape(self, chamber, term):
        # Data available for this term only
        if term != '2010':
            raise NoDataForPeriod(term)

        if chamber == "upper":
            self.scrape_senate()
        elif chamber == "lower":
            self.scrape_house()
Example #25
    def scrape(self, chamber, session):
        # Data available for this session only
        if year_from_session(session) != 2010:
            raise NoDataForPeriod(session)

        if chamber == 'upper':
            self.scrape_senate(session)
        elif chamber == 'lower':
            self.scrape_house(session)
Example #26
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            chamber_name = 'senate'
        else:
            chamber_name = 'house'

        url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        table = page.xpath('//table[@class="legis"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
            name = link.text.strip()
            leg_url = link.get('href')
            district = link.xpath("string(../../td[2])")
            party = link.xpath("string(../../td[3])")
            email = link.xpath("string(../../td[5])")

            if party == 'Democrat':
                party = 'Democratic'

            pid = re.search(r"PID=(\d+)", link.attrib['href']).group(1)
            photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                         "?GA=84&PID=%s" % pid)

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email, photo_url=photo_url, url=url)
            leg.add_source(url)

            leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))
            comm_path = "//a[contains(@href, 'committee')]"
            for comm_link in leg_page.xpath(comm_path):
                comm = comm_link.text.strip()

                match = re.search(r'\((.+)\)$', comm)
                if match:
                    comm = re.sub(r'\((.+)\)$', '', comm).strip()
                    mtype = match.group(1).lower()
                else:
                    mtype = 'member'

                if comm.endswith('Appropriations Subcommittee'):
                    sub = re.match('^(.+) Appropriations Subcommittee$',
                                   comm).group(1)
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee=comm,
                                 position=mtype)

            self.save_legislator(leg)
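
The committee parsing in Example #26 relies on a trailing-parentheses convention: a suffix like "(Chair)" on the link text carries the member's role, and everything else defaults to plain membership. A minimal sketch of that extraction step on its own (the helper name is hypothetical):

    import re

    def split_role(text):
        # "Ways and Means (Chair)" -> ("Ways and Means", "chair")
        match = re.search(r'\((.+)\)$', text)
        if match:
            name = re.sub(r'\((.+)\)$', '', text).strip()
            return name, match.group(1).lower()
        return text, 'member'

    assert split_role("Ways and Means (Chair)") == ("Ways and Means", "chair")
    assert split_role("Education") == ("Education", "member")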
Example #27
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        office_code = {'upper': 'S', 'lower': 'H'}[chamber]

        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = urllib2.urlopen(leg_url)
        page = csv.DictReader(page)

        for row in page:
            if office_code != row['office code']:
                continue

            district = row['dist'].lstrip('0')

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'],
                             url=row['URL'],
                             office_address=office_address,
                             office_phone=row['capitol phone'])
            leg.add_source(leg_url)

            for comm_code in row['committee codes'].split(';'):
                if comm_code:
                    comm_name = self._committee_names[comm_code]
                    leg.add_role('committee member',
                                 term,
                                 chamber='joint',
                                 committee=comm_name)

            self.save_legislator(leg)
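
Example #27 pulls the whole roster from a single FTP-hosted CSV; csv.DictReader keys each row by the header line, which is why fields are addressed as row['first name'], row['dist'], and so on. urllib2 is Python 2 only; a Python 3 sketch of the same download has to decode the byte stream first (the encoding is an assumption):

    import csv
    import io
    from urllib.request import urlopen

    url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    with urlopen(url) as raw:
        reader = csv.DictReader(io.TextIOWrapper(raw, encoding='latin-1'))
        for row in reader:
            # each row is a dict keyed by the CSV header line
            print(row['first name'], row['last name'], row['dist'])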
Example #28
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        session = ((int(term[0:4]) - 2009) // 2) + 124

        if chamber == 'upper':
            self.scrape_senators(chamber, session, term)
        elif chamber == 'lower':
            self.scrape_reps(chamber, session, term)
Example #29
    def _scrape_lower_chamber(self, year):
        # We only have data back to 2000.
        if int(year) < 2000:
            raise NoDataForPeriod(year)

        self.info('Scraping bills from lower chamber.')

        bill_page_url = '{}/BillList.aspx?year={}'.format(
            self._senate_base_url, year)
        self._parse_house_billpage(bill_page_url, year)
Example #30
    def scrape(self, chamber, year):
        if year != '2010':
            raise NoDataForPeriod(year)

        if chamber == 'upper':
            url = 'http://legis.state.nm.us/lcs/leg.aspx?T=S'
        else:
            url = 'http://legis.state.nm.us/lcs/leg.aspx?T=R'

        self.scrape_legislator_data(url, chamber)