Example #1
    def scrape(self, chamber, session):
        # Legislator data is only available for the 2009 session.
        if year_from_session(session) != 2009:
            raise NoDataForPeriod(session)

        if chamber == 'upper':
            self.scrape_legislator_data('upper', session)
        else:
            self.scrape_legislator_data('lower', session)
Example #2
    def scrape(self, chamber, session):
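        # Marker used to split the daily-status page at its House section heading.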
        sep = '<h1>House</h1>'

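        # WA bill numbers encode the chamber: Senate (upper) bills are 5000-9999, House (lower) bills 1000-4999.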
        if chamber == 'upper':
            after = False
            reg = '[5-9]'
        else:
            after = True
            reg = '[1-4]'

        year = str(year_from_session(session))

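        # Fetch the year's daily-status page; separate_content (a local helper) splits the HTML at the sep marker.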
        with self.urlopen("http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + year) as page_html:
            page = lxml.html.fromstring(separate_content(page_html, sep))

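            # Follow every link that points at a bill-detail page in this chamber's number range.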
            for element, attribute, link, pos in page.iterlinks():
                if re.search("bill=" + reg + "[0-9]{3}", link) is not None:
                    bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
                    with self.urlopen(bill_page_url) as bill_page_html:
                        bill_page = lxml.html.fromstring(bill_page_html)
                        raw_title = bill_page.cssselect('title')
                        # The page <title> begins with the two-token bill id.
                        split_title = raw_title[0].text_content().split(' ')
                        bill_id = (split_title[0] + ' ' + split_title[1]).strip()

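                        # The descriptive title lives in the subtitle element of the bill page.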
                        title_element = bill_page.get_element_by_id("ctl00_ContentPlaceHolder1_lblSubTitle")
                        title = title_element.text_content()

                        bill = Bill(session, chamber, bill_id, title)
                        bill.add_source(bill_page_url)

                        self.scrape_actions(bill_page, bill)

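                        # Classify the remaining links: bill documents and versions, sponsor pages, and roll-call votes.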
                        for element, attribute, link, pos in bill_page.iterlinks():
                            if re.search("billdocs", link) is not None:
                                if re.search("Amendments", link) is not None:
                                    bill.add_document("Amendment: " + element.text_content(), link)
                                elif re.search("Bills", link) is not None:
                                    bill.add_version(element.text_content(), link)
                                else:
                                    bill.add_document(element.text_content(), link)
                            elif re.search("senators|representatives", link) is not None:
                                with self.urlopen(link) as senator_page_html:
                                    senator_page = lxml.html.fromstring(senator_page_html)
                                    try:
                                        name_tuple = self.scrape_legislator_name(senator_page)
                                        bill.add_sponsor('primary', name_tuple[0])
                                    except Exception:
                                        # Skip sponsor links whose pages can't be parsed.
                                        pass
                            elif re.search("ShowRollCall", link) is not None:
                                # The roll-call link contains two comma-separated numeric ids used to build the votes URL.
                                match = re.search("([0-9]+,[0-9]+)", link)
                                id1, id2 = match.group(0).split(',')
                                url = votes_url(id1, id2)
                                with self.urlopen(url) as vote_page_html:
                                    vote_page = lxml.html.fromstring(vote_page_html)
                                    self.scrape_votes(vote_page, bill, url)

                        self.save_bill(bill)