def parse_bill(self, chamber, session, special, link):
    """Build a Bill from its info page and save it.

    The bill number is the stripped text of ``link``; the bill type is
    the ``type=`` value found in the link's ``href`` (``B``, ``R`` or
    empty).  The info page is fetched with ``self.urlopen`` and parsed
    with lxml; the bill title is the text of the first cell following
    the ``Short Title:`` cell.

    Versions, history and votes are then attached through
    ``parse_bill_versions``, ``parse_history`` and ``parse_votes``
    before the bill is passed to ``save_bill``.

    :param chamber: chamber value forwarded to the URL helpers and Bill
    :param session: session value forwarded to the URL helpers and Bill
    :param special: special-session value forwarded to the URL helpers
    :param link: element exposing ``.text`` and ``.attrib['href']``
        (presumably an lxml anchor element -- confirm against caller)
    :raises AttributeError: if the href contains no ``type=`` marker
    :raises IndexError: if the page has no cell after ``Short Title:``
    """
    number = link.text.strip()
    kind = re.search('type=(B|R|)', link.attrib['href']).group(1)
    bill_id = "%s%s %s" % (bill_abbr(chamber), kind, number)

    # The info, history and vote URL helpers all take these same five
    # positional arguments, so build the argument tuple once.
    page_key = (chamber, session, special, kind, number)

    url = info_url(*page_key)
    with self.urlopen(url) as raw:
        doc = lxml.html.fromstring(raw)
        doc.make_links_absolute(url)

        title_cells = doc.xpath(
            "//td[text() = 'Short Title:']/following-sibling::td")
        title = title_cells[0].text.strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        self.parse_bill_versions(bill, doc)
        self.parse_history(bill, history_url(*page_key))
        self.parse_votes(bill, vote_url(*page_key))

        self.save_bill(bill)
def scrape_session(self, chamber, session, special=0):
    """Parse every bill linked from one session's bill list page.

    The list page URL comes from ``bill_list_url``; the page is fetched
    with ``self.urlopen`` and parsed with BeautifulSoup.  Every element
    whose ``href`` matches the bill-link pattern for this chamber (type
    ``B`` or ``R`` followed by a numeric ``bn``) is handed to
    ``self.parse_bill``.

    :param chamber: chamber value forwarded to the helpers and parse_bill
    :param session: session value forwarded to the helpers and parse_bill
    :param special: special-session value forwarded unchanged (default 0)
    """
    session_url = bill_list_url(chamber, session, special)
    with self.urlopen(session_url) as bill_list_page:
        bill_list_page = BeautifulSoup(bill_list_page)

        # Raw string: \d is a regex digit class, not a string escape.
        # In a non-raw literal it is an invalid escape sequence that
        # newer interpreters warn about.  The pattern text is unchanged.
        bill_link_re = re.compile(
            r"body=%s&type=(B|R)&bn=\d+" % bill_abbr(chamber))

        for link in bill_list_page.findAll(href=bill_link_re):
            self.parse_bill(chamber, session, special, link)
def parse_bill(self, chamber, session, special, link):
    """Build a Bill from its info page and save it.

    The bill number is the first child of ``link`` (``contents[0]``,
    used as-is); the bill type is the ``type=`` value found in the
    link's ``href`` (``B``, ``R`` or empty).  The info page is fetched
    with ``self.urlopen`` and parsed with BeautifulSoup; the title is
    the first child of the element that follows the ``Short Title:``
    text node.

    Versions, history and votes are then attached through
    ``parse_bill_versions``, ``parse_history`` and ``parse_votes``
    before the bill is passed to ``save_bill``.

    :param chamber: chamber value forwarded to the URL helpers and Bill
    :param session: session value forwarded to the URL helpers and Bill
    :param special: special-session value forwarded to the URL helpers
    :param link: tag supporting ``link['href']`` and ``link.contents``
    :raises AttributeError: if the href contains no ``type=`` marker, or
        if the page has no ``Short Title:`` text node
    """
    number = link.contents[0]
    bill_type = re.search('type=(B|R|)', link['href']).group(1)
    bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, number)

    # The info, history and vote URL helpers all take these same five
    # positional arguments, so build the argument tuple once.
    page_key = (chamber, session, special, bill_type, number)

    source_url = info_url(*page_key)
    with self.urlopen(source_url) as response:
        soup = BeautifulSoup(response)

        label = soup.find(text='Short Title:')
        title = label.findNext().contents[0]

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(source_url)

        self.parse_bill_versions(bill, soup)
        self.parse_history(bill, history_url(*page_key))
        self.parse_votes(bill, vote_url(*page_key))

        self.save_bill(bill)