Exemplo n.º 1
0
    def scrape_bill(self, chamber, session, bill_id):
        """Fetch one WA bill from the legislature's XML service and save it."""
        biennium = "%s-%s" % (session[0:4], session[7:9])
        number = bill_id.split()[1]
        url = "%s/GetLegislation?biennium=%s&billNumber=%s" % (
            self._base_url, biennium, number)

        with self.urlopen(url) as raw:
            leg = lxml.etree.fromstring(raw).xpath(
                "//wa:Legislation", namespaces=self._ns)[0]

            title = leg.xpath("string(wa:LongDescription)", namespaces=self._ns)
            bill_type = leg.xpath(
                "string(wa:ShortLegislationType/wa:LongLegislationType)",
                namespaces=self._ns).lower()

            # Appointments are not bills; skip them entirely.
            if bill_type == "gubernatorial appointment":
                return

            bill = Bill(session, chamber, bill_id, title, type=[bill_type])

            sponsor = leg.xpath(
                "string(wa:Sponsor)", namespaces=self._ns).strip("() \t\r\n")
            bill.add_sponsor("sponsor", sponsor)

            chamber_name = {"lower": "House", "upper": "Senate"}[chamber]
            version_url = (
                "http://www.leg.wa.gov/pub/billinfo/2011-12/Htm/Bills/%s %ss/%s.htm"
                % (chamber_name, bill_type.title(), number))
            bill.add_version(bill_id, version_url)

            self.scrape_actions(bill)
            self.save_bill(bill)
Exemplo n.º 2
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one Puerto Rico bill page: title, author, actions, versions."""
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(
                u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()

            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

            author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            bill.add_sponsor('primary', author.strip())

            # The last table on the page holds the action history;
            # skip its header row.
            for row in doc.xpath('//table')[-1][1:]:
                tds = row.xpath('td')

                # ignore row missing date
                if len(tds) != 2:
                    continue

                when = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")
                action = tds[1].text_content()
                bill.add_action(chamber, action, when)

                # a link inside the action cell is an associated version
                if tds[1].xpath('a'):
                    bill.add_version(action, tds[1].xpath('a/@href')[0])

            bill.add_source(url)
            self.save_bill(bill)
Exemplo n.º 3
0
    def _parse_bill(self, session, chamber, source_url, line):
        """Parse one '\\xe4'-delimited line from the bill feed and save a Bill.

        Fields are: type abbreviation, combined id, number, title, and the
        "relating to" clause.  Only saves the bill when its type matches the
        chamber being scraped (HB -> lower, SB -> upper).
        """
        if not line:
            return

        # renamed from `type`, which shadowed the builtin
        (bill_abbr, combined_id, number, title,
         relating_to) = line.split("\xe4")
        if (bill_abbr == 'HB' and chamber == 'lower') or \
           (bill_abbr == 'SB' and chamber == 'upper'):
            #
            # basic bill info
            bill_id = "%s %s" % (bill_abbr, number.zfill(4))
            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            # add actions collected earlier.  `in` replaces dict.has_key,
            # which was removed in Python 3.
            if bill_id in self.actionsByBill:
                for a in self.actionsByBill[bill_id]:
                    bill.add_action(a['actor'], a['action'], a['date'])

            if self.load_versions_sponsors:
                # add versions and sponsors
                versions_sponsors = self.versionsSponsorsParser.fetch_and_parse(
                    self, session, bill_id)
                if versions_sponsors:
                    for ver in versions_sponsors['versions']:
                        bill.add_version(ver['name'], ver['url'])
                    # a single sponsor is 'primary'; several are cosponsors
                    sponsor_type = 'primary'
                    if len(versions_sponsors['sponsors']) > 1:
                        sponsor_type = 'cosponsor'
                    for name in versions_sponsors['sponsors']:
                        bill.add_sponsor(sponsor_type, name)

            # save - writes out JSON
            self.save_bill(bill)
Exemplo n.º 4
0
    def scrape_bill_info(self, chamber, session):
        """Build Bill objects for one chamber from CT's bill_info.csv feed."""
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        raw = self.urlopen(info_url)
        reader = csv.DictReader(StringIO.StringIO(raw))

        # bill ids are prefixed with the chamber letter
        abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

        for row in reader:
            bill_id = row['bill_num']
            if bill_id[0] != abbrev:
                continue

            # classify from the id prefix; anything else is a plain bill
            for pattern, btype in ((r'^(S|H)J', 'joint resolution'),
                                   (r'^(S|H)R', 'resolution')):
                if re.match(pattern, bill_id):
                    bill_type = btype
                    break
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id,
                        row['bill_title'].decode('latin-1'),
                        type=bill_type)
            bill.add_source(info_url)

            self.scrape_bill_page(bill)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsor('introducer', introducer)

            bill['subjects'] = self._subjects[bill_id]

            self.bills[bill_id] = bill
Exemplo n.º 5
0
    def get_bill_information(self, bill_id, chamber, session):
        """POST the bill id to BILL_INFO_URL and return a populated Bill.

        The title and sponsor list come from <div><b> headings labelled
        "ENTITLED" and "BY"; the bill type is chosen by matching each div
        against self.type_regs (the last match wins).
        """
        with self.urlopen(BILL_INFO_URL, 'POST', body="hListBills=" + bill_id) as bill_info_page:
            self.log("Got bill info")
            page = lxml.html.fromstring(bill_info_page)

            # TODO: check whether page is error page and raise custom exception defined above

            bs = page.xpath('//div/b')
            for b in bs:
                containing_div = b.getparent()
                # NOTE(review): if the page lacks a "BY" heading, `sponsors`
                # is never bound and the loop below raises NameError; same for
                # `title` and "ENTITLED".  Confirm all pages carry both.
                if b.text == "BY":
                    l = containing_div.text_content().strip(u'BY\xa0').split(',')
                    sponsors = map(lambda x: x.strip(' '), l)
                if b.text.strip(u',\xa0') == "ENTITLED":
                    title = containing_div.text_content().lstrip(u'ENTITLED,\xa0')

            divs = page.xpath('//div')
            # scan every div; the last regex that matches decides the type
            bill_type = ""
            for div in divs:
                text = div.text_content()
                for ind, reg in enumerate(self.type_regs):
                    if reg.match(text):
                        bill_type = self.bill_types[ind]

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            # first sponsor is primary, the rest are cosponsors
            for ind, sponsor in enumerate(sponsors):
                if ind == 0:
                    bill.add_sponsor('primary', sponsor)
                else:
                    bill.add_sponsor('cosponsor', sponsor)
        return bill
Exemplo n.º 6
0
    def get_bill_information(self, bill_id, chamber, session):
        """Request the bill-info page for *bill_id* and return a populated Bill."""
        with self.urlopen(BILL_INFO_URL, 'POST',
                          body="hListBills=" + bill_id) as bill_info_page:
            self.log("Got bill info")
            root = lxml.html.fromstring(bill_info_page)

            # TODO: check whether page is error page and raise custom exception defined above

            # Sponsors and title live in <div><b> headings labelled BY / ENTITLED.
            for bold in root.xpath('//div/b'):
                parent = bold.getparent()
                if bold.text == "BY":
                    names = parent.text_content().strip(u'BY\xa0').split(',')
                    sponsors = [name.strip(' ') for name in names]
                if bold.text.strip(u',\xa0') == "ENTITLED":
                    title = parent.text_content().lstrip(u'ENTITLED,\xa0')

            # Classify by matching every div against the type regexes;
            # the last match wins.
            bill_type = ""
            for div in root.xpath('//div'):
                contents = div.text_content()
                for idx, regex in enumerate(self.type_regs):
                    if regex.match(contents):
                        bill_type = self.bill_types[idx]

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            # first sponsor is primary, the rest are cosponsors
            for idx, sponsor in enumerate(sponsors):
                bill.add_sponsor('primary' if idx == 0 else 'cosponsor', sponsor)
        return bill
Exemplo n.º 7
0
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                # header/blank rows carry no date
                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions.  BUG FIX: the loop body previously read `a`, the stale
            # variable left over from the sponsors loop, so every version got
            # the last sponsor's text/href; bind the loop variable it uses.
            for a in page.cssselect('#versions a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Exemplo n.º 8
0
    def scrape_bill(self, session, chamber, bill_type, bill_url):
        """Scrape one Hawaii bill status page and save the resulting Bill.

        Pulls title/subjects/description/sponsors from the summary table,
        actions (with categorized types and associated votes) from the
        GridView table, and the current PDF version when one is linked.
        """
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_url)

            # split "SB1 SD2 HD2" to get SB1
            bill_id = page.xpath('//a[@class="headerlink"]')[0].text.split()[0]

            table = page.xpath('//table[@cellspacing="4px"]')[0]

            title = get_table_text(table, "Measure Title")
            subjects = get_table_text(table, "Report Title").split('; ')
            description = get_table_text(table, "Description")
            sponsors = get_table_text(table, "Introducer(s)")

            bill = Bill(session,
                        chamber,
                        bill_id,
                        title,
                        subjects=subjects,
                        type=bill_type,
                        description=description)
            for sponsor in sponsors.split(', '):
                # drop the trailing "(BR)" by-request marker from names
                if sponsor.endswith(' (BR)'):
                    sponsor = sponsor[:-5]
                bill.add_sponsor('primary', sponsor)

            # actions
            actions = []

            table = page.xpath('//table[contains(@id, "GridView1")]')[0]
            for row in table.xpath('tr'):
                action_params = {}
                cells = row.xpath('td')
                # data rows have exactly 3 cells: date, chamber letter, text
                if len(cells) == 3:
                    ch = cells[1].xpath('font')[0].text
                    # `house` maps the page's chamber letter to upper/lower
                    action_params['actor'] = house[ch]
                    action_params['action'] = cells[2].xpath('font')[0].text
                    action_date = cells[0].xpath('font')[0].text
                    action_params['date'] = datetime.strptime(
                        action_date, "%m/%d/%Y")
                    action_params['type'] = categorize_action(
                        action_params['action'])
                    actions.append(action_params)
            for action_params in actions:
                bill.add_action(**action_params)

                # each action may carry an associated roll-call vote
                self.parse_vote(bill, action_params['action'],
                                action_params['actor'], action_params['date'])

            # Add version document if not on a javascript link.
            try:
                bill_version = page.xpath(
                    '//a[contains(@id, "HyperLinkPDF")]')[0].attrib['href']
                bill.add_version('Current version', bill_version)
            except IndexError:  # href not found.
                pass

        bill.add_source(bill_url)
        self.save_bill(bill)
Exemplo n.º 9
0
    def scrape_bill(self, chamber, session, doc_type, url):
        """Scrape one Illinois bill status page: metadata, sponsors,
        categorized actions, versions, and (when present) votes."""
        doc = self.url_to_doc(url)
        # bill id, title, synopsis.  Raw string: \d is a regex escape, not a
        # string escape (non-raw form is a DeprecationWarning on Python 3.6+).
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath(
            '//span[text()="Short Description:"]/following-sibling::span[1]/text()'
        )[0].strip()
        synopsis = doc.xpath(
            '//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()'
        )[0].strip()

        bill = Bill(session,
                    chamber,
                    bill_id,
                    title,
                    type=bill_type,
                    synopsis=synopsis)

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # actions: the table after the "actions" anchor holds
        # (date, actor, action) cell triples
        action_tds = doc.xpath(
            '//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()
            bill.add_action(actor, action, date, **_categorize_action(action))
            if action.lower().find('sponsor') != -1:
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors.  Distinct loop variable so we don't clobber the
        # `chamber` parameter (the old code shadowed it).
        for spontype, sponsor, sp_chamber in sponsor_list:
            if sp_chamber:
                bill.add_sponsor(spontype, sponsor, chamber=sp_chamber)
            else:
                bill.add_sponsor(spontype, sponsor)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(session, bill, votes_url)

        self.save_bill(bill)
Exemplo n.º 10
0
    def scrape_bill(self, chamber, session):
        """Walk Arkansas' pipe-delimited LegislativeMeasures dump and collect
        bills for the requested chamber."""
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        raw = self.urlopen(url).decode("latin-1")
        reader = unicode_csv_reader(StringIO.StringIO(raw), delimiter="|")

        # map the (H|S)<letters> id prefix to a bill type
        type_names = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
            "MR": "memorial resolution",
            "CMR": "concurrent memorial resolution",
        }

        for row in reader:
            if {"H": "lower", "S": "upper"}[row[0]] != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])
            type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
            bill_type = type_names[type_spec]

            # rows from other sessions carry a different slug in the last column
            if row[-1] != self.slug:
                continue

            bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
            bill.add_source(url)
            bill.add_sponsor("lead sponsor", row[11])

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/%s/Public/%s.pdf"
                           % (session, bill_id.replace(" ", "")))
            bill.add_version(bill_id, version_url)

            self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Exemplo n.º 11
0
    def scrape(self, chamber, session):
        """Emit one fake bill with versions, sponsors, votes and actions
        (demo/test scraper)."""
        self.validate_session(session)

        if chamber == 'upper':
            bill_id, other_chamber = 'SB 1', 'lower'
        else:
            bill_id, other_chamber = 'HB 1', 'upper'

        bill = Bill(session, chamber, bill_id, 'A super bill')
        bill.add_source('http://example.com/')
        bill.add_version('As Introduced', 'http://example.com/SB1.html')
        bill.add_document('Google', 'http://google.com')
        bill.add_sponsor('primary', 'Bob Smith')
        bill.add_sponsor('secondary', 'Johnson, Sally')

        first = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
        second = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')

        # a passing vote and a failed one
        passage = Vote('upper', first, 'Final passage', True, 2, 0, 0)
        passage.yes('Smith')
        passage.yes('Johnson')

        failure = Vote('lower', second, 'Final passage', False, 0, 1, 1)
        failure.no('Bob Smith')
        failure.other('S. Johnson')

        bill.add_vote(passage)
        bill.add_vote(failure)

        bill.add_action(chamber, 'introduced', first)
        bill.add_action(chamber, 'read first time', second)
        bill.add_action(other_chamber, 'introduced', second)

        self.save_bill(bill)
Exemplo n.º 12
0
    def scrape_bill_info(self, chamber, session):
        """Build Bill objects for one chamber from CT's bill_info.csv feed."""
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        page = self.urlopen(info_url)
        page = csv.DictReader(StringIO.StringIO(page))

        # bill ids are prefixed with the chamber letter
        abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

        for row in page:
            bill_id = row['bill_num']
            if not bill_id[0] == abbrev:
                continue

            # classify from the id prefix: SJ/HJ, SR/HR, otherwise plain bill
            if re.match(r'^(S|H)J', bill_id):
                bill_type = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            # NOTE(review): the title is used as-is here while a sibling
            # version of this scraper decodes it from latin-1 — confirm the
            # feed's encoding.
            bill = Bill(session,
                        chamber,
                        bill_id,
                        row['bill_title'],
                        type=bill_type)
            bill.add_source(info_url)

            self.scrape_bill_page(bill)

            # _introducers is populated elsewhere on this scraper
            for introducer in self._introducers[bill_id]:
                bill.add_sponsor('introducer', introducer)

            self.bills[bill_id] = bill
Exemplo n.º 13
0
    def scrape_bill(self, chamber, bill):
        """Scrape a Delaware bill detail page into a Bill and save it.

        *bill* is a dict with 'id', 'url', 'session' and 'chamber' keys
        collected by the listing scraper.
        """
        bill_id = bill['id'].replace('w/', 'with ')

        page = lxml.html.fromstring(self.urlopen(bill['url']))
        page.make_links_absolute(bill['url'])

        title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0]
        # text_content() == make sure any tags in the title don't cause issues
        title = title_row.xpath('td[@width="79%"]/font')[0].text_content()

        # now we can create a bill object
        b = Bill(bill['session'], bill['chamber'], bill_id, title)
        b.add_source(bill['url'])

        sponsors_row = page.xpath('//tr[td/b[contains(font,"Primary Sponsor")]]')[0]
        sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text
        if sponsor is not None:
            b.add_sponsor('primary', sponsor)

        # scraping these and co-sponsors, but not doing anything with them until
        # it's decided whether or not to attempt to split 'em up
        additional = sponsors_row.xpath('td[@width="48%"]/font')
        additional_sponsors = additional[0].text if len(additional) > 0 else ""
        additional_sponsors = additional_sponsors.replace('&nbsp&nbsp&nbsp', '')

        cosponsors_row = page.xpath('//tr[td/b[contains(font,"CoSponsors")]]')[0]
        cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text
        cosponsors = cosponsors if cosponsors != '{ NONE...}' else ''

        introduced_row = page.xpath('//tr[td/b[contains(font,"Introduced On")]]')
        if len(introduced_row) > 0:
            # BUG FIX: was `.expath(...)` (AttributeError) with an absolute
            # '/td...' path that can never match relative to the row element.
            introduced = introduced_row[0].xpath('td[@width="31%"]/font')[0].text
            introduced = datetime.strptime(introduced, '%b %d, %Y')
            b.add_action(bill['chamber'], 'introduced', introduced, 'bill:introduced')

        actions = page.xpath('//table[preceding-sibling::b[contains(font,"Actions History:")]]/tr/td[@width="79%"]/font')
        if len(actions) > 0:
            # each line reads "<date> - <action text>"
            for act in actions[0].text_content().split('\n'):
                date_part, _, action_text = act.partition(' - ')
                date = datetime.strptime(date_part, '%b %d, %Y')
                b.add_action(bill['chamber'], action_text, date)

        # save vote urls for scraping later
        vote_urls = []
        voting_reports = page.xpath('//tr[td/b[contains(font, "Voting Reports")]]')
        if len(voting_reports) > 0:
            for report in voting_reports[0].xpath('td/font/a'):
                vote_urls.append(report.attrib['href'])

        # Scrape votes
        for url in vote_urls:
            vote = self.scrape_votes(chamber, title, bill_id, url)
            b.add_vote(vote)

        # Save bill
        self.save_bill(b)
Exemplo n.º 14
0
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                # header/blank rows carry no date
                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions.  BUG FIX: the body previously read the stale `a`
            # left over from the sponsors loop, so every version carried the
            # last sponsor's text/href; bind the loop variable it actually uses.
            for a in page.cssselect('#versions a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Exemplo n.º 15
0
    def scrape_regular_row(self, chamber, session, row):
        """Build and save a Bill from one row of the regular-session table.

        Rows without a "HyperLink1" anchor are silently ignored.
        """
        params = {'session': session, 'chamber': chamber}

        b = row.xpath('td/font/a[contains(@id, "HyperLink1")]')
        if not b:  # Ignore if no match
            return

        bill_status_url = b[0].attrib['href']
        params['bill_id'] = b[0].xpath('font')[0].text.split()[0]
        params['title'] = row.xpath(
            'td/font/span[contains(@id, "_Label1")]/u/font')[0].text
        subject = row.xpath('td/font/span[contains(@id, "_Label6")]')[0].text
        subject = subject.replace('RELATING TO ', '')  # Remove lead text
        params['subjects'] = [subject.replace('.', '')]
        params['description'] = row.xpath(
            'td/font/span[contains(@id, "_Label2")]')[0].text
        sponsors = row.xpath('td/font/span[contains(@id, "_Label7")]')[0].text
        params['companion'] = row.xpath(
            'td/font/span[contains(@id, "_Label8")]')[0].text

        bill = Bill(**params)
        for sponsor in sponsors.split(', '):
            bill.add_sponsor('primary', sponsor)

        # scrape_actions mutates `bill` in place; its return value (and the
        # old `bill_url` lookup) were bound to unused variables — dropped.
        self.scrape_actions(bill, bill_status_url)
        bill.add_source(bill_status_url)
        self.save_bill(bill)
Exemplo n.º 16
0
    def parse_senate_billpage(self, bill_url, year):
        """Parse a MO Senate bill page (BeautifulSoup) and save the Bill."""
        with self.urlopen(bill_url) as bill_page:
            bill_page = BeautifulSoup(bill_page)
            # get all the info needed to record the bill
            bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
            bill_title = bill_page.find(id="lblBillTitle").font.string
            bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
            bill_lr = bill_page.find(id="lblLRNum").font.string

            bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                        bill_lr=bill_lr, official_title=bill_title)
            bill.add_source(bill_url)

            # Get the primary sponsor.  BUG FIX: `.href` on a BeautifulSoup
            # tag searches for a *child tag* named "href" (always None); the
            # link target is an attribute, read with .get('href').
            sponsor_tag = bill_page.find(id="hlSponsor")
            bill_sponsor = sponsor_tag.i.font.contents[0]
            bill_sponsor_link = sponsor_tag.get('href')
            bill.add_sponsor('primary', bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # cosponsors show up on their own page, if they exist.
            # BUG FIX: `'href' in tag` tests the tag's children, not its
            # attributes; check the attribute instead.
            cosponsor_tag = bill_page.find(id="hlCoSponsors")
            if cosponsor_tag is not None and cosponsor_tag.get('href'):
                self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

            # get the actions
            action_url = bill_page.find(id="hlAllActions")['href']
            self.parse_senate_actions(bill, action_url)

            # full text versions are stored on a separate page
            versions_url = bill_page.find(id="hlFullBillText")
            if versions_url:
                self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
Exemplo n.º 17
0
    def scrape_2009RS_row(self, chamber, session, row):
        """Build and save a Bill from one row of the 2009RS listing table.

        Rows without a "HyperLink1" anchor are silently ignored.
        """
        params = {'session': session, 'chamber': chamber}

        b = row.xpath('td/font/a[contains(@id, "HyperLink1")]')
        if not b:  # Ignore if no match
            return

        bill_status_url = b[0].attrib['href']
        params['bill_id'] = b[0].xpath('font')[0].text
        params['title'] = row.xpath(
            'td/font/span[contains(@id, "_Label1")]/u/font')[0].text
        subject = row.xpath(
            'td/font/span[contains(@id, "_Label6")]')[0].text
        subject = subject.replace('RELATING TO ', '')  # Remove lead text
        params['subject'] = subject.replace('.', '')
        params['description'] = row.xpath(
            'td/font/span[contains(@id, "_Label2")]')[0].text
        sponsors = row.xpath(
            'td/font/span[contains(@id, "_Label7")]')[0].text
        params['companion'] = row.xpath(
            'td/font/span[contains(@id, "_Label8")]')[0].text

        bill = Bill(**params)
        bill.add_sponsor('primary', sponsors)

        # scrape_actions mutates `bill` in place; its return value (and the
        # old `bill_url` lookup) were bound to unused variables — dropped.
        self.scrape_actions(bill, bill_status_url)
        bill.add_source(bill_status_url)
        self.save_bill(bill)
Exemplo n.º 18
0
    def scrape_current(self, chamber, term):
        """Scrape current-session Kansas bills for *chamber* from the ksapi
        bill_status feed: titles, sponsors, and categorized actions."""
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                # classify from the id; 'CR' must be tested before 'R'
                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # BUG FIX: an unrecognized id previously left btype
                    # unbound (NameError on the first bill, or silently reused
                    # the previous bill's type); warn and skip instead.
                    self.warning('unrecognized bill id: %s' % bill_id)
                    continue

                # main
                bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'],
                            type=btype, status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                if bill_data['LONGTITLE']:
                    bill.add_title(bill_data['LONGTITLE'])

                # a lone sponsor is primary; several are all cosponsors
                for sponsor in bill_data['SPONSOR_NAMES']:
                    stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                             else 'cosponsor')
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper' if event['chamber'] == 'Senate'
                             else 'lower')

                    date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning('unknown action code on %s: %s %s' %
                                     (bill_id, event['action_code'],
                                      event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                self.scrape_html(bill)
                self.save_bill(bill)
Exemplo n.º 19
0
    def process_bill(self, data):
        """Convert an OpenCivicData-style bill dict into a legacy Bill and save it.

        Carries over identifiers, actions (with related committees and
        legislators), sources, sponsors, versions, documents, alternate
        titles, and related bills.
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        # unicameral legislatures map to 'upper' in the legacy schema
        if chamber == 'legislature':
            chamber = 'upper'
        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            # only the first abstract is preserved as the summary
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(action['organization_id'])['classification']
            # split related entities into committees vs individual legislators
            legislators = []
            committees = []
            for rel in action['related_entities']:
                if rel['entity_type'] == 'organization':
                    committees.append(rel['name'])
                elif rel['entity_type'] == 'person':
                    legislators.append(rel['name'])
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']),
                            committees=committees,
                            legislators=legislators,
                            **action.get('extras', {}),
                            )

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(sponsor['classification'],
                             sponsor['name'],
                             )

        # each version/document may have several links (one per media type)
        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'], link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']),
                                 **version.get('extras', {}))

        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'], link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']),
                                  **doc.get('extras', {}))

        for title in data['other_titles']:
            bill.add_title(title['title'])

        # NOTE(review): companions reuse this bill's chamber — confirm
        # related bills can never live in the other chamber.
        for related in data['related_bills']:
            bill.add_companion(related['identifier'],
                               related['legislative_session'],
                               chamber
                               )
        self.save_bill(bill)
Exemplo n.º 20
0
    def scrape1999(self, url, year, chamberName, session, number):
        """Scrape a 1999-2000 session Georgia bill summary page.

        e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm
        """
        # BUG FIX: the context-manager result was bound to ``lxml`` while the
        # body referenced an undefined ``page`` -> NameError at runtime.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill name is everything after the first '-' in the link text.
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives at the same path under /fulltext/.
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions: column 0 holds the Senate date, column 2 the House date.
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # skip header/blank rows (no date in either column)
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Exemplo n.º 21
0
    def scrape_bill(self, chamber, session):
        """Read Arkansas's LegislativeMeasures dump and build a Bill for
        every measure belonging to *chamber*."""
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        raw = self.urlopen(url).decode('latin-1')
        reader = unicode_csv_reader(StringIO.StringIO(raw), delimiter='|')

        chamber_map = {'H': 'lower', 'S': 'upper'}
        type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial resolution',
            'CMR': 'concurrent memorial resolution'}

        for record in reader:
            # field 0 is the chamber letter; skip measures from the other one
            if chamber_map[record[0]] != chamber:
                continue

            bill_id = "%s%s %s" % (record[0], record[1], record[2])

            # the letters between the chamber prefix and the number encode
            # the measure type (e.g. "HJR" -> "JR" -> joint resolution)
            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)

            bill = Bill('2011', chamber, bill_id, record[3],
                        type=type_map[type_spec])
            bill.add_source(url)
            bill.add_sponsor('lead sponsor', record[11])

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session, bill_id.replace(' ', '')))
            bill.add_version(bill_id, version_url)

            self.scrape_votes(bill)

            self.bills[bill_id] = bill
Exemplo n.º 22
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape one Oklahoma bill page: title, authors, actions,
        versions and vote links."""
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        # Classify from the bill id; the longer markers must be tested
        # before the bare "R" (every "JR"/"CR" also contains "R").
        bill_type = ["bill"]
        for marker, label in (("JR", "joint resolution"),
                              ("CR", "concurrent resolution"),
                              ("R", "resolution")):
            if marker in bill_id:
                bill_type = [label]
                break

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill["subjects"] = self.subject_map[bill_id]

        # authors and co-authors share the 'Auth' id prefix
        for anchor in page.xpath("//a[contains(@id, 'Auth')]"):
            name = anchor.xpath("string()").strip()
            sponsor_kind = "coauthor" if "otherAuth" in anchor.attrib["id"] else "author"
            bill.add_sponsor(sponsor_kind, name)

        # actions table: first two rows are headers
        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for row in act_table.xpath("tr")[2:]:
            action = row.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = datetime.datetime.strptime(
                row.xpath("string(td[3])").strip(), "%m/%d/%Y").date()

            actor = row.xpath("string(td[4])").strip()
            # map H/S to chamber names; anything else passes through unchanged
            actor = {"H": "lower", "S": "upper"}.get(actor, actor)

            bill.add_action(actor, action, date, type=action_type(action))

        # bill text versions (.DOC links), skipping committee reports
        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for anchor in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = anchor.attrib["href"]
            if "COMMITTEE REPORTS" in version_url:
                continue
            bill.add_version(anchor.text.strip(), version_url)

        for anchor in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(anchor.attrib["href"]))

        self.save_bill(bill)
Exemplo n.º 23
0
    def scrape2003(self, url, year, chamberName, session, number):
        """Scrape a 2003-04 session Georgia bill summary page.

        e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm
        """
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill name is everything after the first '-'.
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                # skip rows without a date
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # BUG FIX: the loop variable was ``row`` while the body used the
            # leftover ``a`` from the sponsorship loop, so every version got
            # the last sponsor's text and href.
            for a in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Exemplo n.º 24
0
    def scrape_bill_info(self, chamber, session):
        """Load Connecticut's bill_info.csv and create a Bill for every row
        belonging to *chamber*."""
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.urlopen(info_url)
        reader = csv.DictReader(StringIO.StringIO(data))

        # bill numbers are prefixed with the chamber letter
        prefix = {"upper": "S", "lower": "H"}[chamber]

        for record in reader:
            bill_id = record["bill_num"]
            if bill_id[0] != prefix:
                continue

            # the second letter of the bill number picks the measure type
            if re.match(r"^(S|H)J", bill_id):
                bill_type = "joint resolution"
            elif re.match(r"^(S|H)R", bill_id):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            bill = Bill(session, chamber, bill_id,
                        record["bill_title"].decode("latin-1"),
                        type=bill_type)
            bill.add_source(info_url)

            self.scrape_bill_page(bill)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsor("introducer", introducer)

            bill["subjects"] = self._subjects[bill_id]

            self.bills[bill_id] = bill
Exemplo n.º 25
0
    def scrape2001(self, url, year, chamberName, session, number):
        """Scrape a 2001-02 session Georgia bill summary page.

        e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm
        """
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table center table")

            # Bill name is everything after the first '-'.
            name = tables[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions
            center = page.cssselect("table center")[-1]

            for row in center.cssselect("table table")[0].cssselect("tr")[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                # skip rows without a date
                if "/" not in date:
                    continue
                if action_text.startswith("Senate"):
                    # drop the leading chamber word from the action text
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("upper", action_text, date)
                elif action_text.startswith("House"):
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("lower", action_text, date)

            # Versions
            # BUG FIX: the loop variable was ``row`` while the body used the
            # leftover ``a`` from the sponsorship loop, so every version got
            # the last sponsor's text and href.
            for a in center.cssselect("table table")[1].cssselect("a"):
                bill.add_version(a.text_content(), urlparse.urljoin(url, a.get("href")))

            self.save_bill(bill)
Exemplo n.º 26
0
    def _parse_bill(self, session, chamber, source_url, line):
        """Parse one '\\xe4'-delimited data line into a Bill and save it.

        Fields: (type, combined_id, number, title, relating_to). Only lines
        whose measure type matches the requested chamber (HB -> lower,
        SB -> upper) are processed.
        """
        if not line:
            return
        # renamed from ``type`` to avoid shadowing the builtin
        (measure_type, combined_id, number, title, relating_to) = line.split("\xe4")
        if (measure_type == 'HB' and chamber == 'lower') or \
           (measure_type == 'SB' and chamber == 'upper'):
            #
            # basic bill info
            bill_id = "%s %s" % (measure_type, number.zfill(4))
            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            #
            # add actions
            # FIX: dict.has_key() is deprecated (removed in Python 3);
            # the ``in`` operator is equivalent and works everywhere.
            if bill_id in self.actionsByBill:
                for a in self.actionsByBill[bill_id]:
                    bill.add_action(a['actor'], a['action'], a['date'])

            if self.load_versions_sponsors:
                # add versions and sponsors
                versionsSponsors = self.versionsSponsorsParser.fetch_and_parse(
                    self, session, bill_id)
                if versionsSponsors:
                    for ver in versionsSponsors['versions']:
                        bill.add_version(ver['name'], ver['url'])
                    # a single sponsor is primary; several are cosponsors
                    sponsorType = 'primary'
                    if len(versionsSponsors['sponsors']) > 1:
                        sponsorType = 'cosponsor'
                    for name in versionsSponsors['sponsors']:
                        bill.add_sponsor(sponsorType, name)

            # save - writes out JSON
            self.save_bill(bill)
Exemplo n.º 27
0
    def scrape2003(self, url, year, chamberName, session, number):
        """Scrape a 2003-04 session Georgia bill summary page.

        e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm
        """
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill name is everything after the first '-'.
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                # skip rows without a date
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # BUG FIX: the loop variable was ``row`` while the body used the
            # leftover ``a`` from the sponsorship loop, so every version got
            # the last sponsor's text and href.
            for a in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Exemplo n.º 28
0
    def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
        """Scrape a single Illinois bill page.

        ``doc_type`` is the document-type prefix (e.g. "HB"); combined with
        the DocNum from *url* it forms the bill id. ``bill_type`` overrides
        the DOC_TYPES lookup when given.
        """
        try:
            doc = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            # a 500 is treated as "bill missing" and skipped; any other
            # HTTP error is unexpected and aborts via the assert
            assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e)
            self.warning("500 error for bill page; skipping bill")
            return

        # bill id, title, summary
        bill_num = re.findall('DocNum=(\d+)', url)[0]
        bill_type = bill_type or DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip()
        summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type,
                    summary=summary)

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # actions: cells come in (date, actor, action) triples
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()
            bill.add_action(actor, action, date,
                            **_categorize_action(action))
            # sponsor-related actions refine the sponsor list built above
            if action.lower().find('sponsor') != -1:
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        # NOTE(review): this loop rebinds the ``chamber`` parameter; harmless
        # here because the parameter is not read again below, but fragile.
        for spontype, sponsor, chamber, official_type in sponsor_list:
            if chamber:
                bill.add_sponsor(spontype, sponsor,
                                 official_type=official_type, chamber=chamber)
            else:
                bill.add_sponsor(spontype, sponsor,
                                 official_type=official_type)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(session, bill, votes_url)

        self.save_bill(bill)
Exemplo n.º 29
0
    def scrape1999(self, url, year, chamberName, session, number):
        """Scrape a 1999-2000 session Georgia bill summary page.

        e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm
        """
        # BUG FIX: the context-manager result was bound to ``lxml`` while the
        # body referenced an undefined ``page`` -> NameError at runtime.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table")

            # Bill name is everything after the first '-' in the link text.
            name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives at the same path under /fulltext/.
            bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

            # Sponsorships
            for a in tables[2].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions: column 0 holds the Senate date, column 2 the House date.
            for row in tables[-1].cssselect("tr"):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # skip header/blank rows (no date in either column)
                if "/" not in senate_date and "/" not in house_date:
                    continue
                if senate_date:
                    bill.add_action("upper", action_text, senate_date)
                if house_date:
                    bill.add_action("lower", action_text, house_date)

            self.save_bill(bill)
Exemplo n.º 30
0
    def scrape1999(self, url, year, chamberName, session, number):
        """Scrape a 1999-2000 session Georgia bill summary page.

        e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm
        """
        # BUG FIX: the context-manager result was bound to ``lxml`` while the
        # body referenced an undefined ``page`` -> NameError at runtime.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill name is everything after the first '-' in the link text.
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives at the same path under /fulltext/.
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions: column 0 holds the Senate date, column 2 the House date.
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # skip header/blank rows (no date in either column)
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Exemplo n.º 31
0
    def scrape_bills(self, chamber, session, subjects):
        """Page through the RI bill search results and save a Bill per hit.

        Posts the search form in MAXQUERY-sized windows starting at
        START_IDEX[chamber] until a window returns no results.
        """
        idex = START_IDEX[chamber]
        FROM = "ctl00$rilinContent$txtBillFrom"
        TO = "ctl00$rilinContent$txtBillTo"
        YEAR = "ctl00$rilinContent$cbYear"
        blocks = "FOO"  # non-empty sentinel so the loop runs at least once
        while len(blocks) > 0:
            default_headers = get_default_headers(SEARCH_URL)
            default_headers[FROM] = idex
            default_headers[TO] = idex + MAXQUERY
            default_headers[YEAR] = session
            idex += MAXQUERY
            blocks = self.parse_results_page(
                self.urlopen(SEARCH_URL, method="POST", body=default_headers))
            blocks = blocks[1:-1]
            blocks = self.digest_results_page(blocks)

            for block in blocks:
                bill = blocks[block]

                title = bill['title'][len("ENTITLED, "):]
                billid = bill['bill_id']
                # FIX: the subjects lookup used to be done twice (once with
                # a dead-store try/except whose result was overwritten);
                # keep the single lookup that defaults to an empty list.
                try:
                    subs = subjects[bill['bill_id']]
                except KeyError:
                    subs = []

                # normalize the bill-id prefix via the translation table
                for b in BILL_NAME_TRANSLATIONS:
                    if billid[:len(b)] == b:
                        billid = BILL_NAME_TRANSLATIONS[b] + \
                            billid[len(b)+1:].split()[0]

                b = Bill(session,
                         chamber,
                         billid,
                         title,
                         type=self.get_type_by_name(bill['bill_id']),
                         subjects=subs)

                self.process_actions(bill['actions'], b)
                # sponsor string looks like "BY Smith, Jones, ..."
                sponsors = bill['sponsors'][len("BY"):].strip()
                sponsors = sponsors.split(",")
                sponsors = [s.strip() for s in sponsors]

                for href in bill['bill_id_hrefs']:
                    b.add_version(href.text,
                                  href.attrib['href'],
                                  mimetype="application/pdf")

                for sponsor in sponsors:
                    b.add_sponsor("primary", sponsor)

                b.add_source(SEARCH_URL)
                self.save_bill(b)
Exemplo n.º 32
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Fetch a Puerto Rico bill page and extract title, primary sponsor,
        actions and per-action versions."""
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            page = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = page.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()

            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

            author = page.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            bill.add_sponsor('primary', author.strip())

            # the last table on the page holds the action history;
            # skip its header row
            for row in page.xpath('//table')[-1][1:]:
                cells = row.xpath('td')

                # ignore row missing date
                if len(cells) != 2:
                    continue

                date = datetime.datetime.strptime(cells[0].text_content(),
                                                  "%m/%d/%Y")
                action = cells[1].text_content()
                bill.add_action(chamber, action, date)

                # also has an associated version
                if cells[1].xpath('a'):
                    bill.add_version(action, cells[1].xpath('a/@href')[0])

            bill.add_source(url)
            self.save_bill(bill)
Exemplo n.º 33
0
    def scrape(self, chamber, session):
        """Scrape the bill index table: one row per bill, with cells for the
        bill PDF, title/sponsors, versions page, actions page and votes page.

        Python 2 idiom throughout: ``iterator.next()`` (``__next__`` in
        Python 3).
        """
        year = year_from_session(session)
        url = bills_url(year)
        with self.urlopen(url) as bills_page_html:
            bills_page = lxml.html.fromstring(bills_page_html)
            table_rows = bills_page.cssselect('tr')
            # Eliminate empty rows
            table_rows = table_rows[0:len(table_rows):2]
            for row in table_rows:
                row_elements = row.cssselect('td')

                # first cell: link to the bill PDF; its text is the bill id
                bill_document = row_elements[0]
                bill_document.make_links_absolute(BASE_URL)

                element, attribute, link, pos = bill_document.iterlinks().next(
                )
                # NOTE(review): rstrip('.pdf') strips any trailing '.', 'p',
                # 'd', 'f' characters, not just the literal suffix; ids
                # ending in those letters would be truncated — confirm.
                bill_id = element.text_content().rstrip('.pdf')
                bill_document_link = link

                # second cell: title and sponsors run together; the regexes
                # split them at the lowercase->uppercase boundary
                title_and_sponsors = row_elements[1]
                title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]',
                                        title_and_sponsors.text_content())
                sponsors_match = re.search('[a-z]([A-Z]+.+)',
                                           title_and_sponsors.text_content())
                title = title_match.group(1)
                sponsors = sponsors_match.group(1)
                separated_sponsors = sponsors.split('--')

                bill = Bill(session, chamber, bill_id, title)
                bill.add_version('current', bill_document_link)

                # a second sponsor of '(NONE)' means single primary sponsor
                if separated_sponsors[1] == '(NONE)':
                    bill.add_sponsor('primary', separated_sponsors[0])

                else:
                    bill.add_sponsor('cosponsor', separated_sponsors[0])
                    bill.add_sponsor('cosponsor', separated_sponsors[1])

                # third cell links to the versions page
                versions_page_element = row_elements[2]
                versions_page_element.make_links_absolute(BASE_URL)
                element, attribute, link, pos = versions_page_element.iterlinks(
                ).next()

                bill.add_source(link)

                self.scrape_versions(link, bill)

                # fourth cell links to the actions page (behind a frame
                # redirect whose real target follows '?Open&target=')
                actions_page_element = row_elements[3]
                element, attribute, link, pos = actions_page_element.iterlinks(
                ).next()
                frame_link = BASE_URL + link.split('?Open&target=')[1]

                self.scrape_actions(frame_link, bill)

                # eighth cell links to the votes page (same frame scheme)
                votes_page_element = row_elements[7]
                element, attribute, link, pos = votes_page_element.iterlinks(
                ).next()
                frame_link = BASE_URL + link.split('?Open&target=')[1]
                self.scrape_votes(frame_link, chamber, bill)
Exemplo n.º 34
0
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Scrape Nevada Assembly measures for a session.

        ``insert`` is the session path fragment used in leg.state.nv.us
        URLs. Iterates the four document types via their numeric DoctypeIDs
        and saves a Bill for every listed measure.
        """

        # DoctypeID -> measure type name on the NV site
        doc_type = {
            1: 'bill',
            3: 'resolution',
            5: 'concurrent resolution',
            6: 'joint resolution'
        }
        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (
                insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                    insert, link)
                page = self.urlopen(page_path)
                # normalize non-breaking spaces before parsing
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                # bill id and title sit in fixed cells of the first table
                bill_id = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)'
                )
                title = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                bill = Bill(session, chamber, bill_id, title, type=bill_type)
                bill['subjects'] = self.subject_mapping[bill_id]
                bill_text = root.xpath(
                    "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)"
                )
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text",
                                 text_url,
                                 mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                # committee-minutes links: the date for each link lives in
                # the matching row of table 4, starting at row 2, so the
                # counter must advance in lockstep with the anchors
                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[
                        1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemplo n.º 35
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            # the bill id is the first word of the page title
            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            # the second letter of the bill id encodes the measure type
            bill_type = {'F': 'bill', 'R':'resolution',
                         'C': 'concurrent resolution'}[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill['subjects'] = self._subject_mapping[bill_id]
            bill.add_source(bill_detail_url)

            # grab sponsors
            sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor, chamber=chamber)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber)

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                kwargs = {}
                if 'committee' in action:
                    kwargs['committees'] = action['committees']

                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'],
                                **kwargs)

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            if 'resolution' in version_html.response.url:
                bill.add_version('resolution text', version_html.response.url,
                                 mimetype='text/html')
            else:
                version_doc = lxml.html.fromstring(version_html)
                for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
                    # BUG FIX: ``mimetype`` was being passed to
                    # urlparse.urljoin(), which accepts no such keyword
                    # (TypeError at runtime); it belongs on add_version().
                    version_url = urlparse.urljoin(VERSION_URL_BASE,
                                                   v.get('href'))
                    bill.add_version(v.text.strip(), version_url,
                                     mimetype='text/html')

        self.save_bill(bill)
Exemplo n.º 36
0
    def scrape_bill(self, chamber, session, doc_type, url):
        """Scrape one Illinois bill page: title, synopsis, sponsors,
        actions, versions and (when present) votes.

        ``doc_type`` is the document-type prefix (e.g. "HB"); combined with
        the DocNum from *url* it forms the bill id.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # bill id, title, synopsis
        bill_num = re.findall('DocNum=(\d+)', url)[0]
        bill_type = DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath(
            '//span[text()="Short Description:"]/following-sibling::span[1]/text()'
        )[0].strip()
        synopsis = doc.xpath(
            '//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()'
        )[0].strip()

        bill = Bill(session,
                    chamber,
                    bill_id,
                    title,
                    type=bill_type,
                    synopsis=synopsis)

        # sponsors: every content-class anchor is treated as a cosponsor
        for sponsor in doc.xpath('//a[@class="content"]/text()'):
            bill.add_sponsor('cosponsor', sponsor)

        # actions: cells come in (date, actor, action) triples
        action_tds = doc.xpath(
            '//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()

            bill.add_action(actor,
                            action,
                            date,
                            type=_categorize_action(action))

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(bill, votes_url)
            bill.add_source(votes_url)

        bill.add_source(url)
        self.save_bill(bill)
Exemplo n.º 37
0
    def scrape(self, chamber, session):
        """Scrape Alabama bills for *chamber* from the ALISON search results."""
        self.site_id = self.metadata['session_details'][session]['internal_id']
        chamber_piece = {
            'upper': 'Senate',
            'lower': 'House+of+Representatives'
        }[chamber]

        # resolutions
        # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

        url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

        self.refresh_session()

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # bills are all their own table with cellspacing=4 (skip first)
            bill_tables = doc.xpath('//table[@cellspacing="4"]')
            for bt in bill_tables[1:]:

                # each table has 3 rows: detail row, description, blank
                details, desc, _ = bt.xpath('tr')

                # first <tr> has img, button, sponsor, topic, current house
                #   current status, committee, committee2, last action
                _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath(
                    'td')

                # pull bill_id out of script tag (gross)
                bill_id = bill_id_re.search(button.text_content()).group()
                oid = btn_re.search(button.text_content()).groups()[0]

                sponsor = sponsor.text_content()
                topic = topic.text_content()
                com1 = com1.text_content()
                com2 = com2.text_content()
                desc = desc.text_content()

                # create bill
                bill = Bill(session,
                            chamber,
                            bill_id,
                            desc.strip(),
                            topic=topic)
                # BUG FIX: arguments were swapped — add_sponsor(sponsor,
                # 'primary'); the call convention used throughout this
                # codebase is add_sponsor(type, name).
                bill.add_sponsor('primary', sponsor)

                self.get_sponsors(bill, oid)
                self.get_actions(bill, oid)

                # craft bill URL
                session_fragment = '2010rs'
                type_fragment = 'bills'
                bill_id_fragment = bill_id.lower()
                bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                    session_fragment, type_fragment, bill_id_fragment)
                bill.add_version('bill text', bill_text_url)

                self.save_bill(bill)
Exemplo n.º 38
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one Puerto Rico bill page: title, sponsors, actions, votes.

        Raises NoSuchBill when the page has no title row (the bill does not
        exist on the site).
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(
                u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()'
            )
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            # authors are a comma-separated list inside a single cell
            author = doc.xpath(
                u'//td/b[contains(text(),"Autor")]/../text()')[0]
            for aname in author.split(','):
                bill.add_sponsor('primary', self.clean_name(aname).strip())
            co_authors = doc.xpath(
                u'//td/b[contains(text(),"Co-autor")]/../text()')
            if len(co_authors) != 0:
                for co_author in co_authors[1].split(','):
                    bill.add_sponsor('cosponsor',
                                     self.clean_name(co_author).strip())
            # the last table on the page is the action history
            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                tds = row.xpath('td')
                # ignore row missing date
                if len(tds) != 2:
                    continue
                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")
                action = tds[1].text_content().strip()
                #parse the text to see if it's a new version or a unrelated document
                #if has - let's *shrug* assume it's a vote document

                #get url of action
                action_url = tds[1].xpath('a/@href')
                atype, action = self.parse_action(chamber, bill, action,
                                                  action_url, date)
                if atype == 'bill:passed' and action_url:
                    vote_chamber = None
                    # for/else: the warning fires only when no pattern matched
                    for pattern, vote_chamber in _voteChambers:
                        if re.match(pattern, action):
                            break

                    else:
                        self.warning('coudnt find voteChamber pattern')

                    if vote_chamber == 'lower' and len(action_url) > 0:
                        vote = self.scrape_votes(action_url[0], action, date,
                                                 vote_chamber)
                        if not vote[0] == None:
                            vote[0].add_source(action_url[0])
                            bill.add_vote(vote[0])
                        else:
                            self.warning('Problem Reading vote: %s,%s' %
                                         (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
Exemplo n.º 39
0
    def scrape_xml(self, chamber, session):
        """Scrape Georgia bills for one chamber from the BillSummary XML feed.

        Parses each <Bill> element for id, titles, sponsors, versions and
        status history, then saves the resulting Bill.
        """
        start_letter = "S" if chamber == "upper" else "H"
        sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor", "5": "sponsor"}
        version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session

        summary_url = "http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml" % session
        xml = self.urlopen(summary_url)
        doc = lxml.etree.fromstring(xml)

        for bxml in doc.xpath("//Bill"):
            type = bxml.get("Type")

            # if this is from the other chamber skip it
            if not type.startswith(start_letter):
                continue

            bill_id = type + bxml.get("Num") + bxml.get("Suffix")
            if type in ("HB", "SB"):
                type = "bill"
            elif type in ("HR", "SR"):
                type = "resolution"
            else:
                raise ValueError("unknown type: %s" % type)

            # use short_title as title and long as description
            title = bxml.xpath("Short_Title/text()")[0]
            description = bxml.xpath("Title/text()")[0]

            bill = Bill(session, chamber, bill_id, title, type=type, description=description)
            bill.add_source(summary_url)

            for sponsor in bxml.xpath("Sponsor"):
                # sponsor text is "Name CODE"; the trailing token is the code
                sponsor_name, code = sponsor.text.rsplit(" ", 1)
                sponsor_name = sponsor_name.replace(",", ", ")
                bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")], sponsor_name, _code=code)

            for version in bxml.xpath("Versions/Version"):
                # NOTE: it is possible to get PDF versions by using .get('Id')
                # ex. URL:  legis.ga.gov/Legislation/20112012/108025.pdf
                # for now we just get HTML
                description, file_id = version.xpath("*/text()")
                bill.add_version(description, version_url + file_id)

            for action in bxml.xpath("StatusHistory/Status"):
                date = datetime.datetime.strptime(action.get("StatusDate"), "%Y-%m-%dT%H:%M:%S")
                code = action.get("StatusCode")
                if code in ("EFF", "Signed Gov"):
                    actor = "executive"
                elif code[0] == "S":
                    actor = "upper"
                elif code[0] == "H":
                    actor = "lower"
                else:
                    # BUG FIX: without a fallback, `actor` was either unbound
                    # (first iteration) or silently carried over from the
                    # previous action for unrecognized codes; attribute such
                    # actions to the bill's own chamber instead.
                    actor = chamber

                atype = self._action_codes[code]

                bill.add_action(actor, action.text, date, atype)

            self.save_bill(bill)
Exemplo n.º 40
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape one Oklahoma bill page: type, sponsors, actions, versions, votes."""
        page = lxml.html.fromstring(self.urlopen(url))

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        # Order matters here: 'JR' and 'CR' must be tested before bare 'R'.
        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        # 'otherAuth' links are coauthors; remaining 'Auth' links are authors
        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('coauthor', name)
            else:
                bill.add_sponsor('author', name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        # skip the two header rows of the actions table
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            bill.add_action(actor, action, date,
                            type=action_type(action))

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib['href']
            # committee report documents are not bill versions
            if 'COMMITTEE REPORTS' in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib['href']))

        self.save_bill(bill)
Exemplo n.º 41
0
    def scrape_bill(self, session, bill_id, chamber):
        """Scrape one Massachusetts bill page and save it.

        Logs a warning and returns False on fetch or parse failure so the
        caller can continue with the remaining bills.
        """
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = u'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

        # BUG FIX: removed a stray Python-2 `print bill_url` debug statement
        # (a syntax error under Python 3 and noise on stdout).
        try:
            response = requests.get(bill_url)
        except requests.exceptions.RequestException:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        page = lxml.html.fromstring(response.text)

        # a missing headline means the site served an error page
        if page.xpath('//div[contains(@class, "followable")]/h1/text()'):
            bill_number = page.xpath('//div[contains(@class, "followable")]/h1/text()')[0]
        else:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

        bill_summary = ''
        if page.xpath('//p[@id="pinslip"]/text()'):
            bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]

        # NOTE(review): this keeps only S, H, digits (and literal '|'), so
        # e.g. 'SD2739' becomes 'S2739' — confirm that is intended.
        bill_id = re.sub(r'[^S|H|\d]', '', bill_id)

        bill = Bill(session, chamber, bill_id, bill_title,
                    summary=bill_summary)
        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Find the non-blank text of the dt following Sponsor or Presenter,
        # including any child link text.
        sponsor = page.xpath('//dt[text()="Sponsor:" or text()="Presenter:"]/following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
        if sponsor:
            sponsor = sponsor[0].strip()
            bill.add_sponsor('primary', sponsor)

        has_cosponsor = page.xpath('//a[starts-with(normalize-space(.),"Petitioners")]')
        if has_cosponsor:
            self.scrape_cosponsors(bill, bill_url)

        version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version('Bill Text', version_url,
                    mimetype='application/pdf')

        self.scrape_actions(bill, bill_url)

        self.save_bill(bill)
Exemplo n.º 42
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.

        Scrapes the detail page for id/title/sponsors/actions, then the
        separate version-list page for bill text versions.
        """
        # normalize the chamber label to openstates form
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            # bill id is the first token of the page <title>
            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            # second character of the id (F/R/C) encodes the bill type
            bill_type = {
                'F': 'bill',
                'R': 'resolution',
                'C': 'concurrent resolution'
            }[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill['subjects'] = self._subject_mapping[bill_id]
            bill.add_source(bill_detail_url)

            # grab sponsors
            sponsors = doc.xpath(
                '//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                # first listed author is primary, the rest are cosponsors
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip())

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'])

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            # a redirect to a resolution URL means there is a single text
            if 'resolution' in version_html.response.url:
                bill.add_version('resolution text', version_html.response.url)
            else:
                version_doc = lxml.html.fromstring(version_html)
                for v in version_doc.xpath(
                        '//a[starts-with(@href, "/bin/getbill.php")]'):
                    version_url = urlparse.urljoin(VERSION_URL_BASE,
                                                   v.get('href'))
                    bill.add_version(v.text.strip(), version_url)

        self.save_bill(bill)
Exemplo n.º 43
0
    def scrape(self, chamber, session):
        """Scrape Alabama bills for one chamber of a session.

        Variant that drives the ASP site with a cookie-carrying fake-browser
        URL opener instead of the scraper's own urlopen.
        """
        self.log(self.metadata['session_details'])
        self.site_id = self.metadata['session_details'][session]['internal_id']
        chamber_piece = {'upper': 'Senate',
                         'lower': 'House+of+Representatives'}[chamber]

        # resolutions
        # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

        url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

        # the site requires a fresh session cookie for the query to work
        cookie = self.refresh_session()

        agent = FakeFirefoxURLopener()
        agent.addheader('Cookie', cookie)
        page = agent.open(url)
        doc = lxml.html.fromstring(page.read())

        # bills are all their own table with cellspacing=4 (skip first)
        bill_tables = doc.xpath('//table[@cellspacing="4"]')
        for bt in bill_tables[1:]:

            # each table has 3 rows: detail row, description, blank
            details, desc, _ = bt.xpath('tr')

            # first <tr> has img, button, sponsor, topic, current house
            #   current status, committee, committee2, last action
            _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

            # pull bill_id out of script tag (gross)
            bill_id = bill_id_re.search(button.text_content()).group()
            self.log(bill_id)
            oid = btn_re.search(button.text_content()).groups()[0]

            sponsor = sponsor.text_content()
            topic = topic.text_content()
            com1 = com1.text_content()
            com2 = com2.text_content()
            desc = desc.text_content()

            # create bill
            bill = Bill(session, chamber, bill_id, desc.strip(),
                        topic=topic)
            # NOTE(review): other scrapers call add_sponsor(type, name) —
            # argument order here looks reversed; confirm against Bill API.
            bill.add_sponsor(sponsor, 'primary')

            self.get_sponsors(bill, oid)
            self.get_actions(bill, oid)

            # craft bill URL
            # NOTE(review): hard-coded to the 2010 regular session
            session_fragment = '2010rs'
            type_fragment = 'bills'
            bill_id_fragment = bill_id.lower()
            bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                session_fragment, type_fragment, bill_id_fragment)
            bill.add_version('bill text', bill_text_url)

            self.save_bill(bill)
Exemplo n.º 44
0
    def scrape_bill(self, term, bill_url):
        """Scrape one Tennessee bill page.

        The page may carry companion bill ids from both chambers; the id
        marked with '*' is treated as the primary bill.
        """
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)

            chamber1 = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text

            # presence of a co-sponsor span means there is a companion bill
            if len(page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')) > 0:

                chamber2 = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')[0].text

                # the starred id is primary; strip spaces and the leading '*'
                if '*' in chamber1:
                    bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
                    secondary_bill_id = chamber2.replace(' ', '')
                else:
                    bill_id = chamber2.replace(' ', '')[1:len(chamber2)]
                    secondary_bill_id = chamber1.replace(' ', '')

                primary_chamber = 'lower' if 'H' in bill_id else 'upper'

            else:
                primary_chamber = 'lower' if 'H' in chamber1 else 'upper'
                bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
                secondary_bill_id = None

            title = page.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(term, primary_chamber, bill_id, title, secondary_bill_id=secondary_bill_id)
            bill.add_source(bill_url)

            # Primary Sponsor
            sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
            sponsor = sponsor.replace('*','').strip()
            bill.add_sponsor('primary',sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc
            summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary.get('href'))

            # Actions
            tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
            actions_table = tables[0]
            # skip the header row of the actions table
            action_rows = actions_table.xpath("tr[position()>1]")
            for ar in action_rows:
                action_taken = ar.xpath("td")[0].text
                action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
                #NEED TO ADD SECONDARY ACTIONS
                bill.add_action(primary_chamber, action_taken, action_date)

            votes_link = page.xpath("//span[@id='lblBillVotes']/a")
            if(len(votes_link) > 0):
                votes_link = votes_link[0].get('href')
                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

            self.save_bill(bill)
Exemplo n.º 45
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one Puerto Rico bill page: title, sponsors, actions, votes.

        Raises NoSuchBill when the page has no title row. Skips silently on
        the site's known ASP error page.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        html = self.urlopen(url)
        if "error '80020009'" in html:
            self.warning('asp error on page, skipping %s', bill_id)
            return
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        # authors are a comma-separated list inside a single cell
        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            aname = self.clean_name(aname).strip()
            if aname:
                bill.add_sponsor('primary', aname)
        co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            for co_author in co_authors[1].split(','):
                bill.add_sponsor('cosponsor', self.clean_name(co_author).strip())
        # the last table on the page is the action history; rows with a blank
        # date cell inherit the previous row's date
        action_table = doc.xpath('//table')[-1]
        date = None
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            if tds[0].text_content():
                date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y")
            if date is None:
                # BUG FIX: a leading row with an empty date cell used to raise
                # UnboundLocalError; skip rows until a date has been seen.
                continue
            action = tds[1].text_content().strip()
            # parse the text to see if it's a new version or a unrelated document
            # if has - let's *shrug* assume it's a vote document

            # get url of action
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)
            if atype == 'bill:passed' and action_url:
                vote_chamber = None
                # for/else: the warning fires only when no pattern matched
                for pattern, vote_chamber in _voteChambers:
                    if re.match(pattern, action):
                        break
                else:
                    self.warning("couldn't find voteChamber pattern")

                if vote_chamber == 'lower' and len(action_url) > 0:
                    vote = self.scrape_votes(action_url[0], action, date,
                                             vote_chamber)
                    if vote[0] is not None:
                        vote[0].add_source(action_url[0])
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem Reading vote: %s,%s' %
                                     (vote[1], bill_id))

        bill.add_source(url)
        self.save_bill(bill)
Exemplo n.º 46
0
    def scrape_bill_page(self, chamber, session, bill_url, bill_type):
        """Scrape one Louisiana bill page plus its linked sub-pages.

        Authors, digests and text versions each live on separate "bare"
        pages reached through links on the main page.
        """
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        # helper: follow the link whose text contains `x` and scrape that page
        sbp = lambda x: self.scrape_bare_page(page.xpath("//a[contains(text(), '%s')]" % (x))[0].attrib["href"])

        authors = [x.text for x in sbp("Authors")]

        try:
            digests = sbp("Digests")
        except IndexError:
            digests = []

        try:
            versions = sbp("Text")
        except IndexError:
            versions = []

        title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        actions = page.xpath("//div[@id='ctl00_PageBody_PanelBillInfo']/" "/table[@style='font-size:small']/tr")

        bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        # bill_type param is e.g. 'SB'/'HCR'; strip the chamber letter
        bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]]
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(bill_url)

        # the primary author also appears in the full author list
        authors.remove(author)
        bill.add_sponsor("primary", author)
        for author in authors:
            bill.add_sponsor("cosponsor", author)

        for digest in digests:
            bill.add_document(digest.text, digest.attrib["href"], mimetype="application/pdf")

        for version in versions:
            bill.add_version(version.text, version.attrib["href"], mimetype="application/pdf")

        flags = {"prefiled": ["bill:filed"], "referred to the committee": ["committee:referred"]}

        # NOTE(review): this loop rebinds the `chamber` and `page` parameters;
        # harmless here since neither is used afterwards, but fragile.
        for action in actions:
            date, chamber, page, text = [x.text for x in action.xpath(".//td")]
            date += "/%s" % (session)  # Session is April --> June. Prefiles
            # look like they're in January at earliest.
            date = dt.datetime.strptime(date, "%m/%d/%Y")
            chamber = {"S": "upper", "H": "lower", "J": "joint"}[chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            if cat == []:
                cat = ["other"]
            bill.add_action(chamber, text, date, cat)

        self.save_bill(bill)
Exemplo n.º 47
0
    def scrape_bills(self, chamber, session, subjects):
        """Page through Rhode Island's bill-search results and save each bill.

        Posts range queries of MAXQUERY bills at a time, starting from the
        chamber's first bill number, until a page comes back empty.
        """
        idex = bill_start_numbers(session)[chamber]
        FROM = "ctl00$rilinContent$txtBillFrom"
        TO = "ctl00$rilinContent$txtBillTo"
        YEAR = "ctl00$rilinContent$cbYear"
        blocks = "FOO"  # non-empty sentinel so the loop runs at least once
        while len(blocks) > 0:
            default_headers = get_default_headers(SEARCH_URL)
            default_headers[FROM] = idex
            default_headers[TO] = idex + MAXQUERY
            default_headers[YEAR] = session
            idex += MAXQUERY
            blocks = self.parse_results_page(self.post(SEARCH_URL,
                                             data=default_headers).text)
            blocks = blocks[1:-1]
            blocks = self.digest_results_page(blocks)

            for block in blocks:
                bill = blocks[block]
                # BUG FIX: `subs` was looked up twice (the first result was
                # discarded and recomputed identically); do it once.
                try:
                    subs = subjects[bill['bill_id']]
                except KeyError:
                    subs = []

                title = bill['title'][len("ENTITLED, "):]
                billid = bill['bill_id']

                # normalize the bill-name prefix, e.g. "House Bill No." -> "HB"
                for b in BILL_NAME_TRANSLATIONS:
                    if billid[:len(b)] == b:
                        billid = BILL_NAME_TRANSLATIONS[b] + \
                            billid[len(b)+1:].split()[0]

                b = Bill(session, chamber, billid, title,
                    type=self.get_type_by_name(bill['bill_id']),
                    subjects=subs
                )

                self.process_actions(bill['actions'], b)
                # sponsor string looks like "BY name, name, ..."
                sponsors = bill['sponsors'][len("BY"):].strip()
                sponsors = sponsors.split(",")
                sponsors = [s.strip() for s in sponsors]

                for href in bill['bill_id_hrefs']:
                    b.add_version(href.text, href.attrib['href'],
                        mimetype="application/pdf")

                for sponsor in sponsors:
                    b.add_sponsor("primary", sponsor)

                b.add_source(SEARCH_URL)
                self.save_bill(b)
Exemplo n.º 48
0
    def scrape_bill(self, chamber, term, bill_id, url, title, subject=None):
        """Scrape one Indiana bill page: sponsors, actions, documents, votes."""
        self.logger.info('GET ' + url)
        response = self.get(url)
        doc = lxml.html.fromstring(response.text)
        doc.make_links_absolute(url)

        bill = Bill(term, chamber, bill_id, title)
        bill.add_source(url)
        if subject is not None:
            bill['subjects'] = [subject]

        # Sponsors: translate the page's label into an openstates sponsor type.
        sponsor_map = {
            'author': 'primary',
            'co-author': 'cosponsor',
            'sponsor': 'cosponsor',
            'co-sponsor': 'cosponsor',
        }
        for info_div in doc.xpath('//div[contains(@class, "bill-author-info")]'):
            sponsor_name = info_div.xpath('string(b)').strip()
            label = info_div.xpath('string(p)').strip().lower()
            bill.add_sponsor(sponsor_map[label], sponsor_name)

        # Actions, oldest first (the page lists them newest first).
        for entry in doc.xpath('//div[@id="bill-actions"]//li')[::-1]:
            if entry.text_content() == 'None currently available.':
                continue
            chamber_abbr = entry.xpath('string(strong)').strip()
            actor = dict(H='lower', S='upper')[chamber_abbr]
            when = entry.xpath('string(span[@class="document-date"])')
            when = datetime.datetime.strptime(when.strip(), '%m/%d/%Y')
            text = entry.xpath('string(span[2])').strip()
            if not text.strip():
                continue
            action_kwargs = dict(date=when,
                                 actor=actor,
                                 action=text)
            action_kwargs.update(**self.categorizer.categorize(text))
            bill.add_action(**action_kwargs)

        # Documents (including votes)
        for doc_kind, doc_meta in BillDocuments(self, doc):
            if doc_kind == 'version':
                bill.add_version(doc_meta.title or doc_meta.text,
                                 url=doc_meta.url,
                                 mimetype='application/pdf')
            elif doc_kind == 'document':
                bill.add_document(doc_meta.title or doc_meta.text,
                                  url=doc_meta.url,
                                  mimetype='application/pdf')
            elif doc_kind == 'rollcall':
                self.add_rollcall(chamber, bill, doc_meta)

        self.save_bill(bill)
Exemplo n.º 49
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = self.urlopen(url)
        except scrapelib.HTTPError:
            self.warning("couldn't open %s, skipping bill" % url)
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        header = page.xpath('//h3/br')[0].tail.replace('&nbsp;', ' ')
        title, primary_sponsor = header.split(' -- ')

        if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
            bill_type = ['bill']
        elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
            bill_type = ['resolution']
        elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
            bill_type = ['joint resolution']

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub("\s+", " ", bill_id).strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):

            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'], mimetype="text/html")
            next = link.getnext()
            if next.text == "PDF":
                bill.add_version(name,
                                 next.attrib['href'],
                                 mimetype="application/pdf")

        for link in page.xpath(
                "//a[contains(@href, 'fnotes') and text() = 'HTML']"):

            bill.add_document("Fiscal Note", link.attrib['href'])

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill['subjects'] = subjects

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        self.save_bill(bill)
Exemplo n.º 50
0
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Scrape Nevada Assembly bills of every document type for a session.

        Walks the per-doctype history listing pages, then each bill's own
        page for versions, sponsors, minutes, actions and votes.
        """
        # numeric DoctypeID used by the site, mapped to openstates bill types
        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution',9:'petition'}
        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                page = self.get(page_path).text
                # strip non-breaking spaces before parsing
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                root.make_links_absolute("http://www.leg.state.nv.us/")

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)
                # de-duplicate subjects collected earlier for this bill id
                bill['subjects'] = list(set(self.subject_mapping[bill_id]))
                billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
                text_urls = billtext.xpath("./a")
                for text_url in text_urls:
                    version_name = text_url.text.strip()
                    version_url = text_url.attrib['href']
                    bill.add_version(version_name, version_url,
                                 mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                # minutes rows start at table row 2; label each document
                # with its meeting date
                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes =  mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1


                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemplo n.º 51
0
    def scrape_bill(self, chamber, session, billid, histurl, year):
        """Scrape one bill from its history page and save it.

        Relies on module-level helpers (``cleansource``, ``cleansponsor``,
        ``issponsorlink``) and the ``urlbase`` URL template.
        """
        if year[0] != 'R':
            session = year
        else:
            # NOTE(review): this branch only runs when year[0] == 'R', so
            # int(year[0]) is int('R') and always raises ValueError; the
            # intended index into `year` should be confirmed and fixed.
            session = self.metadata['session_details'][year]['sub_sessions'][
                int(year[0]) - 1]

        with self.urlopen(histurl) as data:
            soup = BeautifulSoup(cleansource(data))
            basicinfo = soup.findAll('div', id='bhistleft')[0]
            hist = basicinfo.table

            sponsor = None
            title = None
            # The summary text follows a "SUMMARY" label; sponsor links
            # follow a "SPONSOR" label until the first non-sponsor link.
            for b in basicinfo.findAll('b'):
                if b.next.startswith('SUMMARY'):
                    title = b.findNextSiblings(text=True)[0].strip()
                elif b.next.startswith('SPONSOR'):
                    for a in b.findNextSiblings('a'):
                        if not issponsorlink(a):
                            break
                        sponsor = cleansponsor(a.contents[0])

            bill = Bill(session, chamber, billid, title)

            if sponsor:
                bill.add_sponsor('primary', sponsor)

            # Each row of the basic-info table links one bill version.
            for row in hist.findAll('tr'):
                link = row.td.a
                vlink = urlbase % link['href']
                vname = link.contents[0].strip()
                bill.add_version(vname, vlink)

            history = soup.findAll('div', id='bhisttab')[0].table
            rows = history.findAll('tr')[1:]  # skip the header row
            for row in rows:
                tds = row.findAll('td')
                if len(tds) < 2:
                    # This is not actually an action
                    continue
                date, action = row.findAll('td')[:2]
                date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
                action = action.contents[0].strip()
                # Attribute the action to the chamber named in its text,
                # falling back to the bill's own chamber.
                if 'House' in action:
                    actor = 'lower'
                elif 'Senate' in action:
                    actor = 'upper'
                else:  # for lack of a better
                    actor = chamber

                bill.add_action(actor, action, date)

        self.save_bill(bill)
Exemplo n.º 52
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        """Scrape all Senate (upper chamber) bills for a Nevada session.

        For every document type, walk the HistListBills report, scrape
        each bill page for title, bill text, sponsors, committee agendas,
        actions and votes, then save the bill.

        :param chamber: chamber string passed through to ``Bill``
        :param insert: session path fragment used in leg.state.nv.us URLs
        :param session: session identifier for ``Bill``
        :param year: year passed through to ``scrape_votes``
        """
        # DoctypeID values used by the NV site's report, mapped to
        # openstates bill types.
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            for link in links:
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                page = self.get(page_path).text
                # Normalize non-breaking spaces so text matching works.
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)
                bill['subjects'] = list(set(self.subject_mapping[bill_id]))

                # The content table containing "Bill Text" holds the
                # single PDF link for the bill.
                for table in root.xpath('//div[@id="content"]/table'):
                    if 'Bill Text' in table.text_content():
                        bill_text = table.xpath("string(tr/td[2]/a/@href)")
                        text_url = "http://www.leg.state.nv.us" + bill_text
                        bill.add_version("Bill Text", text_url,
                                         mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                # Committee agendas: the date for each link is in td[2]
                # of the same table row; data rows start at index 2.
                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "upper")
                self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemplo n.º 53
0
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        """Parse one bill-info page into a Bill and save it.

        Bills whose current-version link is missing are treated as
        withdrawn and skipped.
        """
        with self.urlopen(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            # The title is the first paragraph after the version link.
            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            # Raw string so \d is a regex digit class, not a deprecated
            # string escape.
            sponsor_links = bill_info.findAll(href=re.compile(
                    r'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                action_date = action.split('-')[0]
                action_date = dt.datetime.strptime(action_date, '%b %d')
                # The page omits the year; derive it from the session
                # string (e.g. '20' + '11' for a session ending in 11).
                action_date = action_date.replace(
                    year=int('20' + session[2:4]))

                action = '-'.join(action.split('-')[1:])

                # "(H)"/"House" and "(S)"/"Senate" suffixes identify the
                # acting chamber; otherwise fall back to the bill's own.
                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile(r'.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.save_bill(bill)
Exemplo n.º 54
0
    def scrape_bill(self, session, chamber, bill_type, bill_url):
        """Scrape a single measure-status page into a Bill and save it."""
        with self.urlopen(bill_url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(bill_url)

            # "SB1 SD2 HD2" -> "SB1": drop draft suffixes from the id.
            measure_link = doc.xpath('//a[@id="LinkButtonMeasure"]')[0]
            bill_id = measure_link.text_content().split()[0]

            title = doc.xpath('//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text
            raw_subjects = doc.xpath('//span[@id="ListView1_ctrl0_report_titleLabel"]')[0].text
            subjects = [s.strip() for s in raw_subjects.split('; ') if s.strip()]
            description = doc.xpath('//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text
            sponsors = doc.xpath('//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text
            referral = doc.xpath('//span[contains(@id, "referral")]/text()')[0]

            bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                        type=bill_type, description=description, referral=referral)

            for name in sponsors.split(', '):
                # Drop the "(BR)" (by-request) suffix from sponsor names.
                if name.endswith(' (BR)'):
                    name = name[:-5]
                bill.add_sponsor('primary', name)

            # Collect actions from the status grid (3-cell rows only),
            # then record each action and look for an associated vote.
            actions = []
            status_table = doc.xpath('//table[@id="GridViewStatus"]')[0]
            for row in status_table.xpath('tr'):
                cells = row.xpath('td')
                if len(cells) != 3:
                    continue
                chamber_code = cells[1].xpath('font')[0].text
                action_text = cells[2].xpath('font')[0].text
                date_text = cells[0].xpath('font')[0].text
                actions.append({
                    'actor': house[chamber_code],
                    'action': action_text,
                    'date': datetime.strptime(date_text, "%m/%d/%Y"),
                    'type': categorize_action(action_text),
                })

            for act in actions:
                bill.add_action(**act)
                self.parse_vote(bill, act['action'], act['actor'], act['date'])

            # add versions
            try:
                for version_link in doc.xpath('//a[contains(@id, "StatusLink")]'):
                    bill.add_version(version_link.text.replace('_', ' '),
                                     version_link.get('href'))
            except IndexError: # href not found.
                pass

        bill.add_source(bill_url)
        self.save_bill(bill)
Exemplo n.º 55
0
    def parse_senate_billpage(self, bill_url, year):
        """Parse one Senate bill page into a Bill and save it.

        Follows the cosponsor, actions, and full-text links to their own
        pages when present.
        """
        bill_page = self.urlopen(bill_url)
        bill_page = lxml.html.fromstring(bill_page)
        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
        # NOTE: bill_title is scraped but the Bill below uses bill_desc
        # as its title.
        bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
        bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

        # The id's three-character prefix (e.g. "SB ") selects the bill
        # type; unknown prefixes default to plain "bill".
        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self.subjects:
            subs = self.subjects[bid]
            self.log("With subjects for this bill")

        self.log(bid)

        bill = Bill(year, 'upper', bill_id, bill_desc,
                    bill_lr=bill_lr, type=bill_type, subjects=subs)
        bill.add_source(bill_url)

        # Get the primary sponsor
        sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
        bill_sponsor = sponsor.text_content()
        bill_sponsor_link = sponsor.attrib.get('href')
        bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        # ('in' instead of the Python-2-only has_key())
        cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
        if len(cosponsor_tag) > 0 and 'href' in cosponsor_tag[0].attrib:
            self.parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

        # get the actions
        action_url = bill_page.xpath('//*[@id="hlAllActions"]')
        if len(action_url) > 0:
            action_url = action_url[0].attrib['href']
            self.parse_senate_actions(bill, action_url)

        # versions are stored on a separate page
        versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
        if len(versions_url) > 0 and 'href' in versions_url[0].attrib:
            self.parse_senate_bill_versions(bill, versions_url[0].attrib['href'])

        self.save_bill(bill)