示例#1
0
    def parse_bill(self, chamber, session, special, link):
        """Scrape the bill that ``link`` points at and save it.

        The bill id is assembled from the chamber abbreviation, the bill
        type captured from the link's ``href`` and the link text.  The
        bill's info page is then fetched to read its short title, after
        which versions, history and votes are handed to the dedicated
        ``parse_*`` methods before the bill is saved.

        :param chamber: chamber identifier, forwarded to ``bill_abbr`` and
            the URL helpers.
        :param session: session identifier, forwarded to the URL helpers
            and to ``Bill``.
        :param special: special-session marker, forwarded to the URL
            helpers.
        :param link: element exposing ``.text`` (the bill number) and
            ``.attrib['href']`` (containing a ``type=`` parameter).
        """
        number = link.text.strip()
        # The captured group is 'B', 'R' or the empty string.
        kind = re.search('type=(B|R|)', link.attrib['href']).group(1)
        bill_id = "%s%s %s" % (bill_abbr(chamber), kind, number)

        # All three URL helpers take the same five identifying arguments.
        url_args = (chamber, session, special, kind, number)

        url = info_url(*url_args)
        with self.urlopen(url) as page:
            doc = lxml.html.fromstring(page)
            doc.make_links_absolute(url)

            # The title lives in the cell right after the 'Short Title:'
            # label cell.
            title_cells = doc.xpath(
                "//td[text() = 'Short Title:']/following-sibling::td")
            title = title_cells[0].text.strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            self.parse_bill_versions(bill, doc)

            history_page = history_url(*url_args)
            self.parse_history(bill, history_page)

            votes_page = vote_url(*url_args)
            self.parse_votes(bill, votes_page)

            self.save_bill(bill)
示例#2
0
    def scrape_session(self, chamber, session, special=0):
        """Scrape every bill linked from a chamber's bill list page.

        Fetches the page at ``bill_list_url(chamber, session, special)``,
        finds each link whose ``href`` matches the bill/resolution link
        pattern for the chamber, and passes it to ``parse_bill``.

        :param chamber: chamber identifier, forwarded to ``bill_list_url``,
            ``bill_abbr`` and ``parse_bill``.
        :param session: session identifier, forwarded to ``bill_list_url``
            and ``parse_bill``.
        :param special: special-session marker (defaults to ``0``),
            forwarded to ``bill_list_url`` and ``parse_bill``.
        """
        session_url = bill_list_url(chamber, session, special)

        with self.urlopen(session_url) as bill_list_page:
            soup = BeautifulSoup(bill_list_page)
            # Raw string: ``\d`` must reach the regex engine as-is.  In a
            # plain string literal it is an invalid escape sequence, which
            # newer Python versions warn about.
            bill_link_re = r"body=%s&type=(B|R)&bn=\d+" % bill_abbr(chamber)

            # Compile once, outside the loop, and let findAll filter on it.
            bill_link_pattern = re.compile(bill_link_re)
            for link in soup.findAll(href=bill_link_pattern):
                self.parse_bill(chamber, session, special, link)
示例#3
0
    def parse_bill(self, chamber, session, special, link):
        """Scrape the bill that ``link`` points at and save it.

        The bill id is assembled from the chamber abbreviation, the bill
        type captured from the link's ``href`` and the link's first child
        node.  The bill's info page is then fetched to read its short
        title, after which versions, history and votes are handed to the
        dedicated ``parse_*`` methods before the bill is saved.

        :param chamber: chamber identifier, forwarded to ``bill_abbr`` and
            the URL helpers.
        :param session: session identifier, forwarded to the URL helpers
            and to ``Bill``.
        :param special: special-session marker, forwarded to the URL
            helpers.
        :param link: parsed anchor supporting ``link.contents`` and
            ``link['href']`` (the href contains a ``type=`` parameter).
        """
        number = link.contents[0]
        # The captured group is 'B', 'R' or the empty string.
        kind = re.search('type=(B|R|)', link['href']).group(1)
        bill_id = "%s%s %s" % (bill_abbr(chamber), kind, number)

        # All three URL helpers take the same five identifying arguments.
        url_args = (chamber, session, special, kind, number)

        source_url = info_url(*url_args)
        with self.urlopen(source_url) as raw_page:
            soup = BeautifulSoup(raw_page)

            # The title is the first child of the element that follows
            # the 'Short Title:' label text.
            label = soup.find(text='Short Title:')
            title = label.findNext().contents[0]

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            self.parse_bill_versions(bill, soup)

            history_page = history_url(*url_args)
            self.parse_history(bill, history_page)

            votes_page = vote_url(*url_args)
            self.parse_votes(bill, votes_page)

            self.save_bill(bill)