Example #1
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            bill.add_sponsor('primary', author.strip())

            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                tds = row.xpath('td')

                # ignore row missing date
                if len(tds) != 2:
                    continue

                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")
                action = tds[1].text_content()
                bill.add_action(chamber, action, date)

                # also has an associated version
                if tds[1].xpath('a'):
                    bill.add_version(action, tds[1].xpath('a/@href')[0])

            bill.add_source(url)
            self.save_bill(bill)
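
The "tulo" substring trick above keeps the accented í of "Título" out of the XPath expression itself, which older lxml/libxml2 combinations handled badly. A minimal, self-contained illustration (the HTML fragment is invented for the demo):

    import lxml.html

    # hypothetical fragment mimicking the bill page layout
    doc = lxml.html.fromstring(
        u'<table><tr><td><b>T\xedtulo</b></td>'
        u'<td>Ley de ejemplo</td></tr></table>')
    # match the unaccented substring instead of embedding 'í' in the XPath
    title = doc.xpath(u'//td/b[contains(text(), "tulo")]/../following-sibling::td/text()')
    print(title)  # [u'Ley de ejemplo']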
Example #2
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.urlopen(url).decode('latin-1')
        page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
            if bill_chamber != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial resolution',
                'CMR': 'concurrent memorial resolution'}[type_spec]

            bill = Bill('2011', chamber, bill_id, row[3], type=bill_type)
            bill.add_source(url)
            bill.add_sponsor('lead sponsor', row[11])

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session, bill_id.replace(' ', '')))
            bill.add_version(bill_id, version_url)

            self.scrape_votes(bill)

            self.bills[bill_id] = bill
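
unicode_csv_reader is not defined in this snippet; a common definition is the recipe from the Python 2 csv module documentation, reproduced here as an assumption about what the scraper imports:

    import csv

    def utf_8_encoder(unicode_csv_data):
        for line in unicode_csv_data:
            yield line.encode('utf-8')

    def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
        # Python 2's csv.reader can't consume unicode directly, so feed it
        # UTF-8 bytes and decode each cell on the way back out
        csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
                                dialect=dialect, **kwargs)
        for row in csv_reader:
            yield [unicode(cell, 'utf-8') for cell in row]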
Example #3
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            for a in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Example #4
    def scrape2001(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table center table")

            # Bill
            name = tables[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions
            center = page.cssselect("table center")[-1]

            for row in center.cssselect("table table")[0].cssselect("tr")[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                if "/" not in date:
                    continue
                if action_text.startswith("Senate"):
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("upper", action_text, date)
                elif action_text.startswith("House"):
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("lower", action_text, date)

            # Versions
            for a in center.cssselect("table table")[1].cssselect("a"):
                bill.add_version(a.text_content(), urlparse.urljoin(url, a.get("href")))

            self.save_bill(bill)
Example #5
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table")

            # Bill
            name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

            # Sponsorships
            for a in tables[2].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions
            for row in tables[-1].cssselect("tr"):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                if "/" not in senate_date and "/" not in house_date:
                    continue
                if senate_date:
                    bill.add_action("upper", action_text, senate_date)
                if house_date:
                    bill.add_action("lower", action_text, house_date)

            self.save_bill(bill)
Example #6
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.urlopen(url).decode("latin-1")
        page = unicode_csv_reader(StringIO.StringIO(page), delimiter="|")

        for row in page:
            bill_chamber = {"H": "lower", "S": "upper"}[row[0]]
            if bill_chamber != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
            bill_type = {
                "B": "bill",
                "R": "resolution",
                "JR": "joint resolution",
                "CR": "concurrent resolution",
                "MR": "memorial resolution",
                "CMR": "concurrent memorial resolution",
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
            bill.add_source(url)
            bill.add_sponsor("lead sponsor", row[11])

            version_url = "ftp://www.arkleg.state.ar.us/Bills/" "%s/Public/%s.pdf" % (session, bill_id.replace(" ", ""))
            bill.add_version(bill_id, version_url)

            self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #7
    def scrape_bill(self, session, chamber, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath("//br")[8].tail
            if not title:
                return
            title = title.strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                path = version_path % version_type
                links = page.xpath(path)
                if links:
                    bill.add_version(version_type, links[0].attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Example #8
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill["subjects"] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsor("coauthor", name)
            else:
                bill.add_sponsor("author", name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            bill.add_action(actor, action, date, type=action_type(action))

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib["href"]
            if "COMMITTEE REPORTS" in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib["href"]))

        self.save_bill(bill)
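
action_type() is defined elsewhere in this scraper; a minimal sketch of such a classifier, with illustrative patterns rather than the scraper's actual rules:

    def action_type(action):
        # map raw action text onto a coarse category; default to 'other'
        if action.startswith('First Reading'):
            return 'bill:reading:1'
        if action.startswith('Second Reading'):
            return 'bill:reading:2'
        if 'Signed by Governor' in action:
            return 'governor:signed'
        return 'other'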
Example #9
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            for a in page.cssselect('#versions a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Example #10
    def scrape(self, session, chambers):
        url = 'http://www.ontla.on.ca/web/bills/bills_all.do?locale=en&parlSessionID=%s' % session
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//table/tr'):
            id, title_td, sponsor = row.xpath('td')
            bill_id = id.text_content().strip()
            title = clean_spaces(title_td.text_content())
            # pull sponsor off different page
            bill = Bill(session, 'lower', bill_id, title)
            # skip to detail page
            detail_url = title_td.xpath('a/@href')[0] + "&detailPage=bills_detail_status"
            bill.add_source(url)
            bill.add_source(detail_url)

            # get actions & sponsors
            self.scrape_details(bill, detail_url)

            if not bill['versions']:
                self.warning('no versions detected via normal method, using '
                             'top-level page')
                bill.add_version('Original (current version)',
                                 title_td.xpath('a/@href')[0],
                                 mimetype='text/html')

            self.save_bill(bill)
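
clean_spaces() is a small helper defined outside this snippet; presumably something like:

    import re

    def clean_spaces(s):
        # collapse runs of whitespace and trim both ends
        return re.sub(r'\s+', ' ', s).strip()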
Example #11
    def scrape_bill(self, chamber, session, bill_id):
        biennium = "%s-%s" % (session[0:4], session[7:9])
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        with self.urlopen(url) as page:
            page = lxml.etree.fromstring(page).xpath("//wa:Legislation",
                                                     namespaces=self._ns)[0]

            title = page.xpath("string(wa:LongDescription)",
                               namespaces=self._ns)

            bill_type = page.xpath(
                "string(wa:ShortLegislationType/wa:LongLegislationType)",
                namespaces=self._ns).lower()

            if bill_type == 'gubernatorial appointment':
                return

            bill = Bill(session, chamber, bill_id, title,
                        type=[bill_type])

            chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
            version_url = ("http://www.leg.wa.gov/pub/billinfo/2011-12/"
                           "Htm/Bills/%s %ss/%s.htm" % (chamber_name,
                                                        bill_type.title(),
                                                        bill_num))
            bill.add_version(bill_id, version_url)

            self.scrape_sponsors(bill)
            self.scrape_actions(bill)

            self.save_bill(bill)
Example #12
    def _parse_bill(self, session, chamber, source_url, line):
        if line:
            (type, combined_id, number, title, relating_to) = line.split("\xe4")
            if (type == 'HB' and chamber == 'lower') or (type == 'SB' and chamber == 'upper'):
                #
                # basic bill info
                bill_id = "%s %s" % (type, number.zfill(4))
                bill = Bill(session, chamber, bill_id, title)
                bill.add_source(source_url)

                #
                # add actions
                if bill_id in self.actionsByBill:
                    for a in self.actionsByBill[bill_id]:
                        bill.add_action(a['actor'], a['action'], a['date'])

                if self.load_versions_sponsors:
                    # add versions and sponsors
                    versionsSponsors = self.versionsSponsorsParser.fetch_and_parse(self, session, bill_id)
                    #print "versionsSponsors: %s" % str(versionsSponsors)
                    if versionsSponsors:
                        for ver in versionsSponsors['versions']:
                            bill.add_version(ver['name'], ver['url'])
                        sponsorType = 'primary'
                        if len(versionsSponsors['sponsors']) > 1:
                            sponsorType = 'cosponsor'
                        for name in versionsSponsors['sponsors']:
                            bill.add_sponsor(sponsorType, name)

                # save - writes out JSON
                self.save_bill(bill)
Example #13
    def process_bill(self, data):
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        if chamber == 'legislature':
            chamber = 'upper'
        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(action['organization_id'])['classification']
            legislators = []
            committees = []
            for rel in action['related_entities']:
                if rel['entity_type'] == 'organization':
                    committees.append(rel['name'])
                elif rel['entity_type'] == 'person':
                    legislators.append(rel['name'])
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']),
                            committees=committees,
                            legislators=legislators,
                            **action.get('extras', {}),
                            )

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(sponsor['classification'],
                             sponsor['name'],
                             )

        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'], link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']),
                                 **version.get('extras', {}))

        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'], link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']),
                                  **doc.get('extras', {}))

        for title in data['other_titles']:
            bill.add_title(title['title'])

        for related in data['related_bills']:
            bill.add_companion(related['identifier'],
                               related['legislative_session'],
                               chamber
                               )
        self.save_bill(bill)
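
data['from_organization'] and action['organization_id'] hold pupa-style pseudo-ids such as ~{"classification": "legislature"}; parse_psuedo_id presumably strips the leading ~ and parses the JSON payload:

    import json

    def parse_psuedo_id(psuedo_id):
        # pseudo-ids look like '~{"classification": "legislature"}'
        assert psuedo_id.startswith('~')
        return json.loads(psuedo_id[1:])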
Example #14
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Example #15
    def scrape(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billindex/"
               "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
        page = lxml.html.fromstring(self.urlopen(url))

        for tr in page.xpath("//tr[@valign='middle']")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # versions
            for a in (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') +
                      tr.xpath('td[10]//a')):
                bill.add_version(a.text, a.get('href'))

            # documents
            fnote = tr.xpath('td[7]//a')
            if fnote:
                bill.add_document('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[12]//a')
            if summary:
                bill.add_document('Summary', summary[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Example #16
    def scrape_assem_bills(self, chamber, insert, session, year):

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution'}
        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                with self.urlopen(page_path) as page:
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                    if insert.find('Special') != -1:
                        session = insert
                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type)
                    bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version("Bill Text", text_url)


                    primary, secondary = self.scrape_sponsors(page)
                    
                    if primary[0] == 'By:':
                        primary.pop(0)
                        
                        if primary[0] == 'ElectionsProceduresEthicsand':
                            primary[0] = 'Elections Procedures Ethics and'

                        full_name = ''
                        for part_name in primary:
                            full_name = full_name + part_name + " "
                        bill.add_sponsor('primary', full_name)
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    minutes_count = 2
                    for mr in root.xpath('//table[4]/tr/td[3]/a'):
                        minutes = mr.xpath("string(@href)")
                        minutes_url = "http://www.leg.state.nv.us" + minutes
                        minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                        minutes_date = mr.xpath(minutes_date_path).split()
                        minutes_date = " ".join(minutes_date[:3]) + " Minutes"
                        bill.add_document(minutes_date, minutes_url)
                        minutes_count += 1

                    self.scrape_actions(root, bill, "lower")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Example #17
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == 'upper':
            other_chamber = 'lower'
            bill_id = 'SB 1'
        else:
            other_chamber = 'upper'
            bill_id = 'HB 1'

        b1 = Bill(session, chamber, bill_id, 'A super bill')
        b1.add_source('http://example.com/')
        b1.add_version('As Introduced', 'http://example.com/SB1.html')
        b1.add_document('Google', 'http://google.com')
        b1.add_sponsor('primary', 'Bob Smith')
        b1.add_sponsor('secondary', 'Johnson, Sally')

        d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
        v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
        v1.yes('Smith')
        v1.yes('Johnson')

        d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
        v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
        v2.no('Bob Smith')
        v2.other('S. Johnson')

        b1.add_vote(v1)
        b1.add_vote(v2)

        b1.add_action(chamber, 'introduced', d1)
        b1.add_action(chamber, 'read first time', d2)
        b1.add_action(other_chamber, 'introduced', d2)

        self.save_bill(b1)
Example #18
    def scrape(self, session, chambers):
        # Get the progress table.
        url = 'http://www.assembly.nl.ca/business/bills/ga47session1.htm'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        for tr in doc.xpath('//table[@class="bills"]/tr')[1:]:
            bill_id = clean_spaces(tr[0].text_content()).strip('*')
            if not bill_id:
                break # empty rows extend past actual list of bills
            if bill_id.endswith("."):
                bill_id = bill_id[:-1]

            title = clean_spaces(tr[1].text_content())
            chapter = tr[-1].text_content()

            bill = Bill(session, 'lower', bill_id, title, type='bill')

            if chapter:
                bill['chapter'] = chapter

            # FIXME need to do more work to figure out what
            # version the text *really* is
            td = tr[1]
            bill_url = td.xpath('a/@href')
            if bill_url:
                bill.add_version(url=bill_url.pop(), name='First Reading',
                    mimetype='text/html')

            # Actions and version urls.
            data = zip([
                'First Reading',
                'Second Reading',
                'Committee',
                'Amendments',
                'Third Reading',
                'Royal Assent',
                'Act'],
                tr[2:-1])

            for action, td in data:
                date_text = td.text_content()
                fmt = r'%b. %d/%Y'
                try:
                    date = datetime.datetime.strptime(date_text, fmt)
                except ValueError:
                    # no parseable date: the bill hasn't reached this stage
                    continue

                attrs = dict(action=action, date=date, actor='lower')
                attrs.update(self.categorizer.categorize(action))
                bill.add_action(**attrs)

            bill.add_source(url)
            self.save_bill(bill)
Example #19
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            bill_type = {'F': 'bill', 'R':'resolution',
                         'C': 'concurrent resolution'}[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill['subjects'] = self._subject_mapping[bill_id]
            bill.add_source(bill_detail_url)

            # grab sponsors
            sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor, chamber=chamber)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber)

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                kwargs = {}
                if 'committee' in action:
                    kwargs['committees'] = action['committees']

                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'],
                                **kwargs)

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            if 'resolution' in version_html.response.url:
                bill.add_version('resolution text', version_html.response.url,
                                 mimetype='text/html')
            else:
                version_doc = lxml.html.fromstring(version_html)
                for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
                    version_url = urlparse.urljoin(VERSION_URL_BASE,
                                                   v.get('href'))
                    bill.add_version(v.text.strip(), version_url,
                                     mimetype='text/html')

        self.save_bill(bill)
Example #20
    def scrape_xml(self, chamber, session):
        start_letter = "S" if chamber == "upper" else "H"
        sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor", "5": "sponsor"}
        version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session

        summary_url = "http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml" % session
        xml = self.urlopen(summary_url)
        doc = lxml.etree.fromstring(xml)

        for bxml in doc.xpath("//Bill"):
            type = bxml.get("Type")

            # if this is from the other chamber skip it
            if not type.startswith(start_letter):
                continue

            bill_id = type + bxml.get("Num") + bxml.get("Suffix")
            if type in ("HB", "SB"):
                type = "bill"
            elif type in ("HR", "SR"):
                type = "resolution"
            else:
                raise ValueError("unknown type: %s" % type)

            # use short_title as title and long as description
            title = bxml.xpath("Short_Title/text()")[0]
            description = bxml.xpath("Title/text()")[0]

            bill = Bill(session, chamber, bill_id, title, type=type, description=description)
            bill.add_source(summary_url)

            for sponsor in bxml.xpath("Sponsor"):
                sponsor_name, code = sponsor.text.rsplit(" ", 1)
                sponsor_name = sponsor_name.replace(",", ", ")
                bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")], sponsor_name, _code=code)

            for version in bxml.xpath("Versions/Version"):
                # NOTE: it is possible to get PDF versions by using .get('Id')
                # ex. URL:  legis.ga.gov/Legislation/20112012/108025.pdf
                # for now we just get HTML
                description, file_id = version.xpath("*/text()")
                bill.add_version(description, version_url + file_id)

            for action in bxml.xpath("StatusHistory/Status"):
                date = datetime.datetime.strptime(action.get("StatusDate"), "%Y-%m-%dT%H:%M:%S")
                code = action.get("StatusCode")
                if code in ("EFF", "Signed Gov"):
                    actor = "executive"
                elif code[0] == "S":
                    actor = "upper"
                elif code[0] == "H":
                    actor = "lower"

                atype = self._action_codes[code]

                bill.add_action(actor, action.text, date, atype)

            self.save_bill(bill)
Example #21
    def scrape_bill(self, chamber, session, bill_id):
        biennium = "%s-%s" % (session[0:4], session[7:9])
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        page = self.urlopen(url)
        page = lxml.etree.fromstring(page.bytes)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title,
                    type=[bill_type])

        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))
        bill.add_source(fake_source)

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        mimetype = 'text/html'
        version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                       "Htm/Bills/%s %ss/%s.htm" % (biennium,
                                                    chamber_name,
                                                    bill_type.title(),
                                                    bill_num))

        # Sometimes the measure's version_url isn't guessable. When that
        # happens, fall back to pulling the url from the source page.
        try:
            version_resp = self.get(version_url)
            if version_resp.status_code != 200:
                webpage = self.get(fake_source).text
                webdoc = lxml.html.fromstring(webpage)
                version_url = webdoc.xpath('//a[contains(@href, "billdocs")]/@href')[-1]
                if version_url.lower().endswith('.pdf'):
                    mimetype = 'application/pdf'
        except scrapelib.HTTPError:
            pass

        bill.add_version(bill_id, version_url, mimetype=mimetype)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_votes(bill)
        self.fix_prefiled_action_dates(bill)

        return bill
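
xpath() here is a thin wrapper that binds the wa: prefix for the Washington legislature's web service; a sketch, assuming the service's usual namespace URI:

    NS = {'wa': 'http://WSLWebServices.leg.wa.gov/'}

    def xpath(elem, path):
        # evaluate an XPath expression with the 'wa:' prefix bound
        return elem.xpath(path, namespaces=NS)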
Example #22
    def scrape_bill(self, chamber, session, bill_id, url):
        page = lxml.html.fromstring(self.urlopen(url))

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('coauthor', name)
            else:
                bill.add_sponsor('author', name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            bill.add_action(actor, action, date,
                            type=action_type(action))

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib['href']
            if 'COMMITTEE REPORTS' in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib['href']))

        self.save_bill(bill)
Example #23
    def scrape_bill(self, session, chamber, bill_id, short_title, url):
        if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
            return

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # check for Bill Withdrawn header
            h1text = page.xpath('//h1/text()')
            if h1text and h1text[0] == 'Bill Withdrawn':
                return

            title = page.xpath("//br")[8].tail
            if not title:
                title = short_title
            title = title.strip()

            abbrev = bill_id.split()[0]
            if abbrev.endswith('B'):
                bill_type = ['bill']
            elif abbrev.endswith('JR'):
                bill_type = ['joint resolution']
            elif abbrev.endswith('CR'):
                bill_type = ['concurrent resolution']
            elif abbrev.endswith('R'):
                bill_type = ['resolution']

            bill = Bill(session, chamber, bill_id, title,
                        type=bill_type)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                path = version_path % version_type
                links = page.xpath(path)
                if links:
                    bill.add_version(version_type, links[0].attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
                self.scrape_senate_vote(bill, vote_link.attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
                self.scrape_house_vote(bill, vote_link.attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Example #24
    def scrape_bill(self, session, bill_id, chamber):
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = u'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

        self.log(bill_url)

        try:
            response = requests.get(bill_url)
        except requests.exceptions.RequestException as e:
            self.warning(u'Server Error on {}: {}'.format(bill_url, e))
            return False

        html = response.text

        page = lxml.html.fromstring(html)

        bill_number = page.xpath('//div[contains(@class, "followable")]/h1/text()')
        if bill_number:
            bill_number = bill_number[0]
        else:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

        bill_summary = ''
        if page.xpath('//p[@id="pinslip"]/text()'):
            bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]

        bill_id = re.sub(r'[^SH\d]', '', bill_id)

        bill = Bill(session, chamber, bill_id, bill_title,
                    summary=bill_summary)
        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 has no sponsor
        # Find the non-blank text of the dt following Sponsor or Presenter,
        # including any child link text.
        sponsor = page.xpath('//dt[text()="Sponsor:" or text()="Presenter:"]/following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
        if sponsor:
            sponsor = sponsor[0].strip()
            bill.add_sponsor('primary', sponsor)

        has_cosponsor = page.xpath('//a[starts-with(normalize-space(.),"Petitioners")]')
        if has_cosponsor:
            self.scrape_cosponsors(bill, bill_url)

        version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version('Bill Text', version_url,
                    mimetype='application/pdf')

        self.scrape_actions(bill, bill_url)

        self.save_bill(bill)
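
replace_non_digits() reduces a session name like '189th' to the numeric part expected in the URL; presumably along these lines:

    import re

    def replace_non_digits(self, session):
        # '189th' -> '189': keep only the digits for the URL path
        return re.sub(r'\D', '', session)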
Example #25
    def scrape_bill_page(self, chamber, session, bill_url, bill_type):
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        def sbp(x):
            # follow the in-page link whose text contains x and scrape it
            return self.scrape_bare_page(
                page.xpath("//a[contains(text(), '%s')]" % x)[0].attrib["href"])

        authors = [x.text for x in sbp("Authors")]

        try:
            digests = sbp("Digests")
        except IndexError:
            digests = []

        try:
            versions = sbp("Text")
        except IndexError:
            versions = []

        title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        actions = page.xpath("//div[@id='ctl00_PageBody_PanelBillInfo']"
                             "//table[@style='font-size:small']/tr")

        bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]]
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(bill_url)

        authors.remove(author)
        bill.add_sponsor("primary", author)
        for author in authors:
            bill.add_sponsor("cosponsor", author)

        for digest in digests:
            bill.add_document(digest.text, digest.attrib["href"], mimetype="application/pdf")

        for version in versions:
            bill.add_version(version.text, version.attrib["href"], mimetype="application/pdf")

        flags = {"prefiled": ["bill:filed"], "referred to the committee": ["committee:referred"]}

        for action in actions:
            date, chamber, page, text = [x.text for x in action.xpath(".//td")]
            date += "/%s" % (session)  # Session is April --> June. Prefiles
            # look like they're in January at earliest.
            date = dt.datetime.strptime(date, "%m/%d/%Y")
            chamber = {"S": "upper", "H": "lower", "J": "joint"}[chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            if cat == []:
                cat = ["other"]
            bill.add_action(chamber, text, date, cat)

        self.save_bill(bill)
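
lxmlize() is a common helper in these scrapers; a sketch of the usual definition (fetch, parse, make links absolute):

    import lxml.html

    def lxmlize(self, url):
        # fetch the page, parse it, and absolutize relative links
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        return page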
Example #26
    def scrape(self, chamber, session):
        self.log(self.metadata['session_details'])
        self.site_id = self.metadata['session_details'][session]['internal_id']
        chamber_piece = {'upper': 'Senate',
                         'lower': 'House+of+Representatives'}[chamber]

        # resolutions
        # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

        url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

        cookie = self.refresh_session()

        agent = FakeFirefoxURLopener()
        agent.addheader('Cookie', cookie)
        page = agent.open(url)
        doc = lxml.html.fromstring(page.read())

        # bills are all their own table with cellspacing=4 (skip first)
        bill_tables = doc.xpath('//table[@cellspacing="4"]')
        for bt in bill_tables[1:]:

            # each table has 3 rows: detail row, description, blank
            details, desc, _ = bt.xpath('tr')

            # first <tr> has img, button, sponsor, topic, current house
            #   current status, committee, committee2, last action
            _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

            # pull bill_id out of script tag (gross)
            bill_id = bill_id_re.search(button.text_content()).group()
            self.log(bill_id)
            oid = btn_re.search(button.text_content()).groups()[0]

            sponsor = sponsor.text_content()
            topic = topic.text_content()
            com1 = com1.text_content()
            com2 = com2.text_content()
            desc = desc.text_content()

            # create bill
            bill = Bill(session, chamber, bill_id, desc.strip(),
                        topic=topic)
            bill.add_sponsor('primary', sponsor)

            self.get_sponsors(bill, oid)
            self.get_actions(bill, oid)

            # craft bill URL
            session_fragment = '2010rs'
            type_fragment = 'bills'
            bill_id_fragment = bill_id.lower()
            bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                session_fragment, type_fragment, bill_id_fragment)
            bill.add_version('bill text', bill_text_url)

            self.save_bill(bill)
Example #27
    def scrape_bills(self, chamber, session, subjects):
        idex = bill_start_numbers(session)[chamber]
        FROM="ctl00$rilinContent$txtBillFrom"
        TO="ctl00$rilinContent$txtBillTo"
        YEAR="ctl00$rilinContent$cbYear"
        blocks = "FOO" # Ugh.
        while len(blocks) > 0:
            default_headers = get_default_headers(SEARCH_URL)
            default_headers[FROM] = idex
            default_headers[TO] = idex + MAXQUERY
            default_headers[YEAR] = session
            idex += MAXQUERY
            blocks = self.parse_results_page(self.post(SEARCH_URL,
                                             data=default_headers).text)
            blocks = blocks[1:-1]
            blocks = self.digest_results_page(blocks)

            for block in blocks:
                bill = blocks[block]

                title = bill['title'][len("ENTITLED, "):]
                billid = bill['bill_id']
                try:
                    subs   = subjects[bill['bill_id']]
                except KeyError:
                    subs   = []

                for b in BILL_NAME_TRANSLATIONS:
                    if billid[:len(b)] == b:
                        billid = BILL_NAME_TRANSLATIONS[b] + \
                            billid[len(b)+1:].split()[0]

                b = Bill(session, chamber, billid, title,
                         type=self.get_type_by_name(bill['bill_id']),
                         subjects=subs)

                self.process_actions(bill['actions'], b)
                sponsors = bill['sponsors'][len("BY"):].strip()
                sponsors = sponsors.split(",")
                sponsors = [s.strip() for s in sponsors]

                for href in bill['bill_id_hrefs']:
                    b.add_version(href.text, href.attrib['href'],
                                  mimetype="application/pdf")

                for sponsor in sponsors:
                    b.add_sponsor("primary", sponsor)

                b.add_source(SEARCH_URL)
                self.save_bill(b)
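
get_default_headers() presumably harvests the hidden ASP.NET form fields (__VIEWSTATE and friends) that the search form requires on every POST; a sketch under that assumption:

    import lxml.html
    import requests

    def get_default_headers(url):
        # collect the hidden ASP.NET inputs (__VIEWSTATE, __EVENTVALIDATION,
        # ...) so the search form can be re-submitted programmatically
        page = lxml.html.fromstring(requests.get(url).text)
        fields = {}
        for inp in page.xpath('//input[@type="hidden"]'):
            fields[inp.get('name')] = inp.get('value') or ''
        return fields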
Example #28
    def scrape_current(self, chamber, term):
        chamber_name = "Senate" if chamber == "upper" else "House"
        with self.urlopen(
            ksapi.url + "bill_status/"
        ) as bill_request:  # perhaps we should save this data so we can make on request for both chambers?
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json["content"]
            for bill_data in bills:
                # filter out bills from the other chamber
                bill_is_in_chamber = False
                for history in bill_data["HISTORY"]:
                    if history["chamber"] == chamber_name:
                        bill_is_in_chamber = True
                if not bill_is_in_chamber:
                    continue

                # main bill info
                bill = Bill(term, chamber, bill_data["BILLNO"], bill_data["SHORTTITLE"])
                bill.add_source(ksapi.url + "bill_status/" + bill_data["BILLNO"].lower())
                if bill_data["LONGTITLE"]:
                    bill.add_title(bill_data["LONGTITLE"])
                bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
                bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

                for sponsor in bill_data["SPONSOR_NAMES"]:
                    bill.add_sponsor("primary" if len(bill_data["SPONSOR_NAMES"]) == 1 else "cosponsor", sponsor)

                for event in bill_data["HISTORY"]:
                    if "committee_names" in event and "conferee_names" in event:
                        actor = " and ".join(bill_data["committee_names"] + bill_data["conferee_names"])
                    elif "committee_names" in history:
                        actor = " and ".join(bill_data["committee_names"])
                    elif "conferee_names" in history:
                        actor = " and ".join(bill_data["conferee_names"])
                    else:
                        actor = "upper" if chamber == "Senate" else "lower"

                    date = datetime.datetime.strptime(event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                    bill.add_action(actor, event["status"], date)

                    if event["action_code"] in ksapi.voted:
                        votes = votes_re.match(event["status"])
                        if votes:
                            vote = Vote(
                                chamber,
                                date,
                                votes.group(1),
                                event["action_code"] in ksapi.passed,
                                int(votes.group(2)),
                                int(votes.group(3)),
                                0,
                            )
                            vote.add_source(ksapi.ksleg + "bill_status/" + bill_data["BILLNO"].lower())
                            bill.add_vote(vote)

                self.save_bill(bill)
Example #29
    def scrape_bill(self, chamber, term, bill_id, url, title, subject=None):
        self.logger.info('GET ' + url)
        resp = self.get(url)
        html = resp.text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        bill = Bill(term, chamber, bill_id, title)
        bill.add_source(url)
        if subject is not None:
            bill['subjects'] = [subject]

        # Sponsors
        sponsor_map = {
            'author': 'primary',
            'co-author': 'cosponsor',
            'sponsor': 'cosponsor',
            'co-sponsor': 'cosponsor',
            }
        for div in doc.xpath('//div[contains(@class, "bill-author-info")]'):
            name = div.xpath('string(b)').strip()
            sp_type = sponsor_map[div.xpath('string(p)').strip().lower()]
            bill.add_sponsor(sp_type, name)

        # Actions
        for li in doc.xpath('//div[@id="bill-actions"]//li')[::-1]:
            if li.text_content() == 'None currently available.':
                continue
            chamber_str = li.xpath('string(strong)').strip()
            action_chamber = dict(H='lower', S='upper')[chamber_str]
            action_date = li.xpath('string(span[@class="document-date"])')
            # Some resolution actions have no dates.
            if not action_date.strip():
                continue
            action_date = datetime.datetime.strptime(action_date.strip(), '%m/%d/%Y')
            action_text = li.xpath('string(span[2])').strip()
            if not action_text.strip():
                continue
            kwargs = dict(date=action_date, actor=action_chamber, action=action_text)
            kwargs.update(**self.categorizer.categorize(action_text))
            bill.add_action(**kwargs)

        # Documents (including votes)
        for doc_type, doc_meta in BillDocuments(self, doc):
            if doc_type == 'version':
                bill.add_version(
                    doc_meta.title or doc_meta.text, url=doc_meta.url,
                    mimetype='application/pdf')
            elif doc_type == 'document':
                bill.add_document(doc_meta.title or doc_meta.text, url=doc_meta.url,
                    mimetype='application/pdf')
            elif doc_type == 'rollcall':
                self.add_rollcall(chamber, bill, doc_meta)

        self.save_bill(bill)
Example #30
    def scrape_assem_bills(self, chamber, insert, session, year):

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution', 9: 'petition'}
        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                root.make_links_absolute("http://www.leg.state.nv.us/")

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)
                bill['subjects'] = list(set(self.subject_mapping[bill_id]))
                billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
                text_urls = billtext.xpath("./a")
                for text_url in text_urls:
                    version_name = text_url.text.strip()
                    version_url = text_url.attrib['href']
                    bill.add_version(version_name, version_url,
                                     mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    # join the date tokens with spaces, e.g. "May 5, 2011 Minutes"
                    minutes_date = ' '.join(mr.xpath(minutes_date_path).split()[:3])
                    bill.add_document(minutes_date + " Minutes", minutes_url)
                    minutes_count += 1


                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemplo n.º 31
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = self.urlopen(url)
        except scrapelib.HTTPError:
            self.warning("couldn't open %s, skipping bill" % url)
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        header = page.xpath('//h3/br')[0].tail.replace('&nbsp;', ' ')
        title, primary_sponsor = header.split(' -- ')

        if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
            bill_type = ['bill']
        elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
            bill_type = ['resolution']
        elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
            bill_type = ['joint resolution']
        else:
            bill_type = ['bill']  # fallback so bill_type is always defined

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        for link in page.xpath(
            '//a[contains(@href, "bills/") and text() = "HTML"]'):

            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'], mimetype="text/html")
            pdf_link = link.getnext()  # avoid shadowing the next() builtin
            if pdf_link is not None and pdf_link.text == "PDF":
                bill.add_version(name, pdf_link.attrib['href'],
                                 mimetype="application/pdf")

        for link in page.xpath(
            "//a[contains(@href, 'fnotes') and text() = 'HTML']"):

            bill.add_document("Fiscal Note", link.attrib['href'])

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill['subjects'] = subjects

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        self.save_bill(bill)
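The prefix chain above can also be written as a small lookup table, which makes the fallback explicit; a sketch (longer prefixes must be tested before their shorter counterparts, e.g. 'H.C.R.' before 'H.R.'):

    PREFIX_TYPES = [
        ('H.C.R.', 'concurrent resolution'), ('S.C.R.', 'concurrent resolution'),
        ('H.J.R.', 'joint resolution'), ('S.J.R.', 'joint resolution'),
        ('H.B.', 'bill'), ('S.B.', 'bill'),
        ('H.R.', 'resolution'), ('S.R.', 'resolution'),
    ]

    def classify(bill_id):
        # first matching prefix wins; default to a plain bill
        for prefix, type_ in PREFIX_TYPES:
            if bill_id.startswith(prefix):
                return [type_]
        return ['bill']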
Exemplo n.º 32
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            bill_type = {
                'F': 'bill',
                'R': 'resolution',
                'C': 'concurrent resolution'
            }[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill['subjects'] = self._subject_mapping[bill_id]
            bill.add_source(bill_detail_url)

            # grab sponsors
            sponsors = doc.xpath(
                '//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip())

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'])

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            version_doc = lxml.html.fromstring(version_html)
            for v in version_doc.xpath(
                    '//a[starts-with(@href, "/bin/getbill.php")]'):
                version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
                bill.add_version(v.text.strip(), version_url)

        self.save_bill(bill)
Exemplo n.º 33
0
    def scrape_bill(self, chamber, session, billid, histurl, year):
        if year[0] != 'R':
            session = year
        else:
            session = self.metadata['session_details'][year]['sub_sessions'][
                int(year[0]) - 1]

        with self.urlopen(histurl) as data:
            soup = BeautifulSoup(cleansource(data))
            basicinfo = soup.findAll('div', id='bhistleft')[0]
            hist = basicinfo.table

            sponsor = None
            title = None
            for b in basicinfo.findAll('b'):
                if b.next.startswith('SUMMARY'):
                    title = b.findNextSiblings(text=True)[0].strip()
                elif b.next.startswith('SPONSOR'):
                    for a in b.findNextSiblings('a'):
                        if not issponsorlink(a):
                            break
                        sponsor = cleansponsor(a.contents[0])

            bill = Bill(session, chamber, billid, title)

            if sponsor:
                bill.add_sponsor('primary', sponsor)

            for row in hist.findAll('tr'):
                link = row.td.a
                vlink = urlbase % link['href']
                vname = link.contents[0].strip()
                bill.add_version(vname, vlink)

            history = soup.findAll('div', id='bhisttab')[0].table
            rows = history.findAll('tr')[1:]
            for row in rows:
                tds = row.findAll('td')
                if len(tds) < 2:
                    # This is not actually an action
                    continue
                date, action = tds[:2]
                date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
                action = action.contents[0].strip()
                if 'House' in action:
                    actor = 'lower'
                elif 'Senate' in action:
                    actor = 'upper'
                else:  # neither chamber named; fall back to the bill's own chamber
                    actor = chamber

                bill.add_action(actor, action, date)

        self.save_bill(bill)
Exemplo n.º 34
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            for link in links:
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)
                bill['subjects'] = list(set(self.subject_mapping[bill_id]))

                for table in root.xpath('//div[@id="content"]/table'):
                    if 'Bill Text' in table.text_content():
                        bill_text = table.xpath("string(tr/td[2]/a/@href)")
                        text_url = "http://www.leg.state.nv.us" + bill_text
                        bill.add_version("Bill Text", text_url,
                                         mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    # join the date tokens with spaces, e.g. "May 5, 2011 Agenda"
                    minutes_date = ' '.join(mr.xpath(minutes_date_path).split()[:3])
                    bill.add_document(minutes_date + " Agenda", minutes_url)
                    minutes_count += 1

                self.scrape_actions(root, bill, "upper")
                self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemplo n.º 35
0
    def scrape_bill(self, session, chamber, bill_type, bill_url):
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_url)

            # split "SB1 SD2 HD2" to get SB1
            bill_id = page.xpath('//a[@id="LinkButtonMeasure"]')[0].text_content().split()[0]

            title = page.xpath('//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text
            subjects = page.xpath('//span[@id="ListView1_ctrl0_report_titleLabel"]')[0].text.split('; ')
            subjects = [s.strip() for s in subjects if s.strip()]
            description = page.xpath('//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text
            sponsors = page.xpath('//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text
            referral = page.xpath('//span[contains(@id, "referral")]/text()')[0]

            bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                        type=bill_type, description=description, referral=referral)
            for sponsor in sponsors.split(', '):
                if sponsor.endswith(' (BR)'):
                    sponsor = sponsor[:-5]
                bill.add_sponsor('primary', sponsor)

            # actions
            actions = []

            table = page.xpath('//table[@id="GridViewStatus"]')[0]
            for row in table.xpath('tr'):
                action_params = {}
                cells = row.xpath('td')
                if len(cells) == 3:
                    ch = cells[1].xpath('font')[0].text
                    action_params['actor'] = house[ch]
                    action_params['action'] = cells[2].xpath('font')[0].text
                    action_date = cells[0].xpath('font')[0].text
                    action_params['date'] = datetime.strptime(action_date, "%m/%d/%Y")
                    action_params['type'] = categorize_action(action_params['action'])
                    actions.append(action_params)
            for action_params in actions:
                bill.add_action(**action_params)

                self.parse_vote(bill, action_params['action'],
                                action_params['actor'], action_params['date'])

            # add versions; guard against anchors with no text
            # (the original try/except IndexError could never fire here)
            for version in page.xpath('//a[contains(@id, "StatusLink")]'):
                if version.text:
                    bill.add_version(version.text.replace('_', ' '),
                                     version.get('href'))

        bill.add_source(bill_url)
        self.save_bill(bill)
Exemplo n.º 36
0
    def scrape_bill(self, session, chamber, bill_id, short_title, url):
        if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
            return

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath("//br")[8].tail
            if not title:
                title = short_title
            title = title.strip()

            abbrev = bill_id.split()[0]
            if abbrev.endswith('B'):
                bill_type = ['bill']
            elif abbrev.endswith('JR'):
                bill_type = ['joint resolution']
            elif abbrev.endswith('CR'):
                bill_type = ['concurrent resolution']
            elif abbrev.endswith('R'):
                bill_type = ['resolution']

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                path = version_path % version_type
                links = page.xpath(path)
                if links:
                    bill.add_version(version_type, links[0].attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
                self.scrape_senate_vote(bill, vote_link.attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
                self.scrape_house_vote(bill, vote_link.attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Exemplo n.º 37
0
    def scrape_bill(self, chamber, session, bill_id):
        biennium = "%s-%s" % (session[0:4], session[7:9])
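        # e.g. session '2011-2012' -> biennium '2011-12'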
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        page = self.urlopen(url)
        page = lxml.etree.fromstring(page.bytes)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title, type=[bill_type])

        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" %
                       (bill_num, session[0:4]))
        bill.add_source(fake_source)

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        mimetype = 'text/html'
        version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                       "Htm/Bills/%s %ss/%s.htm" %
                       (biennium, chamber_name, bill_type.title(), bill_num))

        # Sometimes the measure's version_url isn't guessable. When that
        # happens, we have to get the URL from the source page instead.
        version_resp = self.get(version_url)
        if version_resp.status_code != 200:
            webpage = self.get(fake_source).text
            webdoc = lxml.html.fromstring(webpage)
            version_url = webdoc.xpath(
                '//a[contains(@href, "billdocs")]/@href')[-1]
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'

        bill.add_version(bill_id, version_url, mimetype=mimetype)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_votes(bill)
        self.fix_prefiled_action_dates(bill)

        return bill
Exemplo n.º 38
0
    def process_bill(self, data):
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        bill = Bill(data['legislative_session'],
                    chamber,
                    data['identifier'],
                    data['title'],
                    subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(
                action['organization_id'])['classification']
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']))
            # TODO: related entities

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(
                sponsor['classification'],
                sponsor['name'],
            )

        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'],
                                 link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']))

        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'],
                                  link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']))

        for title in data['other_titles']:
            bill.add_title(title)

        # TODO: related bills
        # for related in data['related_bills']:

        self.save_bill(bill)
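A minimal sketch of what parse_psuedo_id might do, assuming pupa-style pseudo-ids of the form '~{"classification": "lower"}' (the helper itself is defined elsewhere; this body is illustrative):

    import json

    def parse_psuedo_id(pid):
        # strip the leading '~' marker and parse the JSON payload,
        # e.g. '~{"classification": "lower"}' -> {'classification': 'lower'}
        assert pid.startswith('~'), pid
        return json.loads(pid[1:])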
Exemplo n.º 39
0
    def scrape1995(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current',
                             url.replace('/sum/', '/fulltext/'),
                             mimetype='text/html')

            # Sponsorships
            rows = page.cssselect('center table tr')
            for row in rows:
                if row.text_content().strip() == 'Sponsor and CoSponsors':
                    continue
                if row.text_content().strip() == 'Links / Committees / Status':
                    break
                for a in row.cssselect('a'):
                    bill.add_sponsor('', a.text_content().strip())

            # Actions
            # The actions are in a pre table that looks like:
            """    SENATE                         HOUSE
                   -------------------------------------
                 1/13/95   Read 1st time          2/6/95
                 1/31/95   Favorably Reported
                 2/1/95    Read 2nd Time          2/7/95
                 2/3/95    Read 3rd Time
                 2/3/95    Passed/Adopted                   """

            actions = page.cssselect('pre')[0].text_content().split('\n')
            actions = actions[2:]
            for action in actions:
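                # fixed-width columns: [:22] senate date, [23:46] action text, [46:] house date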
                senate_date = action[:22].strip()
                action_text = action[23:46].strip()
                house_date = action[46:].strip()

                if '/' not in senate_date and '/' not in house_date:
                    continue

                if senate_date:
                    bill.add_action('upper', action_text, senate_date)

                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Exemplo n.º 40
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            for link in links:
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                with self.urlopen(page_path) as page:
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type)
                    bill['subjects'] = self.subject_mapping[bill_id]

                    bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version("Bill Text", text_url)

                    primary, secondary = self.scrape_sponsors(page)

                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)


                    minutes_count = 2
                    for mr in root.xpath('//table[4]/tr/td[3]/a'):
                        minutes = mr.xpath("string(@href)")
                        minutes_url = "http://www.leg.state.nv.us" + minutes
                        minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                        # join the date tokens with spaces, e.g. "May 5, 2011 Minutes"
                        minutes_date = ' '.join(mr.xpath(minutes_date_path).split()[:3])
                        bill.add_document(minutes_date + " Minutes", minutes_url)
                        minutes_count += 1

                    self.scrape_actions(root, bill, "upper")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Exemplo n.º 41
0
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == 'upper':
            bill_no = 1
            abbr = 'SB'
        else:
            bill_no = 4001
            abbr = 'HB'
        while True:
            bill_page = self.scrape_bill(session, abbr, bill_no)
            # if we can't find a page, we must be done. This is a healthy thing.
            # (check before parsing: BeautifulSoup(None) would blow up, and the
            # parsed soup is never None)
            if bill_page is None:
                return
            bill_page = BeautifulSoup(bill_page)
            title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
            title = title.replace('\n','').replace('\r','')
            bill_id = "%s %d" % (abbr, bill_no)

            the_bill = Bill(session, chamber, bill_id, title)

            # sponsors: the first listed is primary, the rest are cosponsors
            first = 0
            for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'):
                the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
                first = 1

            # versions
            for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r: the_bill.add_version(*r)

            # documents
            if 'frg_billstatus_HlaTable' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)
            if 'frg_billstatus_SfaSection' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)

            self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
            self.save_bill(the_bill)
            bill_no = bill_no + 1
Exemplo n.º 42
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
            if bill_chamber != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial resolution',
                'CMR': 'concurrent memorial resolution'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]
            if primary:
                bill.add_sponsor('primary', primary)

            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: keep an eye on this post-2017 to see if they apply the 'R' suffix going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session_code, bill_id.replace(' ', '')))
            bill.add_version(bill_id, version_url, mimetype='application/pdf')

            self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Exemplo n.º 43
0
    def scrape_bill(self, chamber, session, bill_id):
        biennium = "%s-%s" % (session[0:4], session[7:9])
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        page = self.urlopen(url)
        page = lxml.etree.fromstring(page.bytes)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title,
                    type=[bill_type])

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                       "Htm/Bills/%s %ss/%s.htm" % (biennium,
                                                    chamber_name,
                                                    bill_type.title(),
                                                    bill_num))
        bill.add_version(bill_id, version_url, mimetype='text/html')

        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))
        bill.add_source(fake_source)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_votes(bill)
        self.fix_prefiled_action_dates(bill)

        return bill
Exemplo n.º 44
0
    def scrape(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        for tr in page.xpath(
                "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # versions
            for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                      tr.xpath('td[12]//a')):
                # skip references to other bills
                if a.text.startswith('See'):
                    continue
                bill.add_version(a.text,
                                 a.get('href'),
                                 mimetype='application/pdf')

            # documents
            fnote = tr.xpath('td[9]//a')
            if fnote:
                bill.add_document('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[14]//a')
            if summary:
                bill.add_document('Summary', summary[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Exemplo n.º 45
0
    def scrape_bill(self, chamber, session, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            header = page.xpath('//h3/br')[0].tail.replace('&nbsp;', ' ')
            title, primary_sponsor = header.split(' -- ')

            if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
                bill_type = ['bill']
            elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
                bill_type = ['resolution']
            elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
                bill_type = ['concurrent resolution']
            elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
                bill_type = ['joint resolution']
            else:
                bill_type = ['bill']  # fallback so bill_type is always defined

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_sponsor('primary', primary_sponsor)
            bill.add_source(url)

            for link in page.xpath(
                    '//a[contains(@href, "bills/") and text() = "HTML"]'):

                name = link.getprevious().tail.strip()
                bill.add_version(name, link.attrib['href'])

            for link in page.xpath(
                    "//a[contains(@href, 'fnotes') and text() = 'HTML']"):

                bill.add_document("Fiscal Note", link.attrib['href'])

            subjects = []
            for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
                subjects.append(link.text.strip())
            bill['subjects'] = subjects

            status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
            self.parse_status(bill, status_link.attrib['href'])

            self.save_bill(bill)
Exemplo n.º 46
0
    def scrape(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billindex/"
               "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
        page = lxml.html.fromstring(self.urlopen(url))

        for tr in page.xpath("//tr[@valign='middle']")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # versions
            for a in (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') +
                      tr.xpath('td[10]//a')):
                # skip references to other bills
                if a.text.startswith('See'):
                    continue
                bill.add_version(a.text,
                                 a.get('href'),
                                 mimetype='application/pdf')

            # documents
            fnote = tr.xpath('td[7]//a')
            if fnote:
                bill.add_document('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[12]//a')
            if summary:
                bill.add_document('Summary', summary[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Exemplo n.º 47
0
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for votes, which
       are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)
    bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url)
    actions = extract_actions(s)
    for chamber, action, date in actions:
        bill.add_action(chamber, action, date)  # kwargs are permitted if we have 'em.
    sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions])
    for type_, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(type_, name)
    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)
    return bill
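A usage sketch for parse_bill, assuming a scrapelib-style scraper object (the names here are illustrative); per the docstring, votes are attached by the caller before saving:

    bill = parse_bill(scraper, status_url)
    # ... add votes obtained externally with bill.add_vote(vote) ...
    scraper.save_bill(bill)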
Exemplo n.º 48
0
    def scrape1997(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1997_98/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = []
            for table in page.cssselect('center table'):
                if table.get('border') == '5':
                    tables.append(table)

            # Bill
            name = page.cssselect(
                'tr > td > font > b')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current',
                             url.replace('/sum/', '/fulltext/'),
                             mimetype='text/html')

            # Sponsorships
            for a in tables[0].cssselect('a'):
                if a.text_content().strip() == 'Current':
                    break
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in tables[1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Exemplo n.º 49
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.urlopen(url)
        page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
            if bill_chamber != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial resolution',
                'CMR': 'concurrent memorial resolution'
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
            bill.add_source(url)
            bill.add_sponsor('primary', row[11])

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" %
                           (session, bill_id.replace(' ', '')))
            bill.add_version(bill_id, version_url)

            self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Exemplo n.º 50
0
    def scrape2001(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table center table')

            # Bill
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('table center')[-1]

            for row in center.cssselect('table table')[0].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    action_text = action_text.split(' ', 1)[1].strip()
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    action_text = action_text.split(' ', 1)[1].strip()
                    bill.add_action('lower', action_text, date)

            # Versions (the original looped over 'row' but read 'a', a leftover
            # from the sponsor loop above)
            for a in center.cssselect('table table')[1].cssselect('a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')),
                                 mimetype='text/html')

            self.save_bill(bill)
Exemplo n.º 51
0
    def scrape_bill(self,
                    url,
                    kw,
                    re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                    re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                    re_digits=re.compile(r'\d{,5}'),
                    actions_categorize=actions.categorize,
                    actions_get_actor=actions.get_actor):

        bill = Bill(**kw)
        bill.add_source(url)

        #---------------------------------------------------------------------
        # A few helpers.
        _url_2_lxml = self._url_2_lxml
        _cleanup_sponsors = self._cleanup_sponsors

        # Shortcut function partial to get text at a particular xpath:
        doc = _url_2_lxml(url)
        _get_text = partial(get_text, doc, 0)

        # Get session number--needed for fetching related documents (see below).
        xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
        session_num = doc.xpath(xpath)[0].text_content()
        session_num = re_digits.match(session_num).group()

        #---------------------------------------------------------------------
        # Sponsors
        chamber = bill['chamber']

        sponsor_types = {
            'Additional Sponsor(s):': 'cosponsor',
            'CoSponsors:': 'cosponsor',
            'Primary Sponsor:': 'primary'
        }

        xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
        headings = doc.xpath(xpath + '/text()')
        sponsors = doc.xpath(xpath +
                             '/../../following-sibling::td/font/text()')

        for h, s in zip(headings, sponsors):

            names = _cleanup_sponsors(s, chamber)
            type_ = sponsor_types[h.strip()]

            if names:
                for name, _chamber in names:
                    bill.add_sponsor(type_, name, chamber=_chamber)

        #---------------------------------------------------------------------
        # Versions

        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/vwLegislation',
            '{moniker}/$file/{filename}{format_}?open'
        ])
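        # the braces are str.format placeholders, presumably filled inside
        # scrape_documents, e.g. session_num='146', filename='Legis',
        # format_='.pdf' (these example values are illustrative)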

        documents = self.scrape_documents(source=url,
                                          docname="introduced",
                                          filename="Legis",
                                          tmp=tmp,
                                          session_num=session_num)

        for d in documents:
            bill.add_version(**d)

        # If bill is a substitution, add the original as a version.
        names = doc.xpath('//*[contains(text(), "Substituted '
                          'Legislation for Bill:")]/text()')
        urls = doc.xpath('//*[contains(text(), "Substituted '
                         'Legislation for Bill:")]'
                         '/following-sibling::a/@href')

        for name, url in zip(names, urls):

            name = re_substitution.match(name).group(1)
            bill.add_version(name, url, description='original bill')

        #---------------------------------------------------------------------
        # Actions
        actions = doc.xpath('//font[contains(., "Actions History")]'
                            '/../following-sibling::table/descendant::td[2]')
        actions = actions[0].text_content()
        actions = filter(None, actions.splitlines())

        for a in reversed(actions):
            date, action = a.split(' - ', 1)
            try:
                date = datetime.strptime(date, '%b %d, %Y')
            except ValueError:
                # some dates use the full month name instead of the abbreviation
                date = datetime.strptime(date, '%B %d, %Y')

            actor = actions_get_actor(action, bill['chamber'])
            type_ = actions_categorize(action)
            bill.add_action(actor, action, date, type_)

        #---------------------------------------------------------------------
        # Votes
        vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()')

        # Sometimes vote strings are contained in weird, separate elements,
        # probably hand edited.
        if not all(re.search(r'\d', string) for string in vote_strings):
            # Use the parent's text_content instead.
            vote_strings = []
            for el in doc.xpath('//*[contains(text(), "vote:")]/..'):
                vote_strings.append(el.text_content())

        vote_urls = doc.xpath('//*[contains(text(), "vote:")]'
                              '/following-sibling::a/@href')
        for string, url in zip(vote_strings, vote_urls):

            vote_data = parse_votestring(string)
            vote = self.scrape_vote(url, **vote_data)
            if vote:
                bill.add_vote(vote)

        #---------------------------------------------------------------------
        # Amendments
        xpath = ("//font[contains(., 'Amendments')]/" "../../../td[2]/font/a")

        tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
               'vwLegislation/{id_}/$file/{filename}{format_}?open')

        for source, id_ in zip(doc.xpath(xpath + '/@href'),
                               doc.xpath(xpath + '/text()')):

            short_id = re_amendment.match(id_).group(1)

            documents = self.scrape_documents(source=source,
                                              docname='amendment (%s)' %
                                              short_id,
                                              filename='Legis',
                                              tmp=tmp,
                                              session_num=session_num,
                                              id_=id_)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Add any related "Engrossments".
        # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
        # an explanation of the engrossment process in DE.
        source = doc.xpath('//img[@alt="Engrossment"]/../@href')

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
                '{moniker}/$file/{filename}{format_}?open'
            ])

            documents = self.scrape_documents(source=source[0],
                                              docname="Engrossment",
                                              filename="Engross",
                                              tmp=tmp,
                                              session_num=session_num,
                                              id_=bill['bill_id'])

            for d in documents:
                bill.add_version(**d)

        # --------------------------------------------------------------------
        # Add any fiscal notes.
        source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/FiscalforLookup',
                '{docnum}/$file/{filename}{format_}?open'
            ])

            documents = self.scrape_documents(source=source[0],
                                              docname="Fiscal Note",
                                              filename="Fiscal",
                                              tmp=tmp,
                                              session_num=session_num)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Extra fields

        # Helper to get the first td sibling of certain nodes.
        tmp = '//font[contains(., "%s")]/../../../td[2]'
        first_sibling_text = lambda heading: _get_text(tmp % heading)

        extra_fields = {
            # A long description of the legislation.
            "summary": "Synopsis",
            # Codification details for enacted legislation.
            "volume_chapter": "Volume Chapter",
            # Presumably the date of approval/veto.
            "date_governor_acted": "Date Governor Acted",
            "fiscal_notes": "Fiscal Notes",
        }

        for key, name in extra_fields.iteritems():
            try:
                bill[key] = first_sibling_text(name)
            except IndexError:
                # xpath lookup failed.
                pass

        self.save_bill(bill)
Exemplo n.º 52
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath(
            '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            r'(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsor('primary', sponsors[0])

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsor('cosponsor', sponsor)
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '',
                                   spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsor('primary', spons_str)

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            if re.match("\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    self.parse_vote(bill, action, act_chamber, act_date,
                                    vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1),
                                                      '%m/%d/%y')

            bill.add_action(act_chamber, action, act_date, type=atype)

        # Get subjects
        bill['subjects'] = []
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill['subjects'].append(subj.strip())

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
            session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version(name, text_url, mimetype="text/html")

        # Get documents
        doc_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_documents.asp?session=%s&bill=%s" % (
                    session, bill_id )
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            if h_name.strip():
                bill.add_document(h_name, h_href)

        self.save_bill(bill)
Exemplo n.º 53
0
    def scrape_bill_pages(self, session, year_abr):
        """ assemble information on a bill from a number of DBF files
        """

        # Main bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)


        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            document = rec["document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]
            year = str(year_abr) + str((year_abr + 1))

            #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name the document based on its _doctype
            try:
                doc_name = self._doctypes[rec['doctype']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['doctype'], bill_id))
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr)+1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]
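        # e.g. with year_abr 2010: A2010, A2011, S2010, S2011, CA2010-2011, CS2010-2011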

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
            zipedfile = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % filename
            vote_file = zipedfile.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:

                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N); slice [0:1] avoids an IndexError on ''
                    leg_vote = rec['LegislatorVote'][0:1]

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join((bill_id, chamber, action))
                vote_id = vote_id.replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, None, None,
                                          None, None, bill_id=bill_id)
                if vote_file_type == 'committee':
                    votes[vote_id]['committee'] = self._committees[
                        rec['Committee_House']]

                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            # remove temp file
            os.remove(s_vote_zip)

            # count yes/no/other votes and record whether each vote passed
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                if vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False
                vote_bill_id = vote["bill_id"]
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            action, atype = self.categorize_action(action)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
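    # A minimal sketch (not part of the scraper above) of what its
    # get_dbf(year_abr, name) helper is assumed to do: fetch a DBF table
    # from the NJ legislature FTP site and return (url, records). The URL
    # layout and the dbfread dependency are assumptions.
    def get_dbf(self, year_abr, file_name):
        from dbfread import DBF
        url = 'ftp://www.njleg.state.nj.us/ag/%sdata/%s.DBF' % (year_abr,
                                                                file_name)
        dbf_path, resp = self.urlretrieve(url)  # scrapelib download helper
        # lowernames=True yields rec["billtype"]-style keys as used above
        return url, DBF(dbf_path, lowernames=True)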
Exemplo n.º 54
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(
                u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()'
            )
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(
                u'//td/b[contains(text(),"Autor")]/../text()')[0]
            for aname in author.split(','):
                bill.add_sponsor('primary', aname.strip())

            co_authors = doc.xpath(
                u'//td/b[contains(text(),"Co-autor")]/../text()')
            if co_authors:
                for co_author in co_authors[1].split(','):
                    bill.add_sponsor('cosponsor', co_author.strip())

            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                tds = row.xpath('td')

                # ignore row missing date
                if len(tds) != 2:
                    continue

                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")

                action = tds[1].text_content().strip()

                # grab the action's link, if any; rows without an <a> are
                # plain-text actions with no associated document
                action_url = tds[1].xpath('a/@href')

                if action_url:
                    action_url = action_url[0]
                    # decide whether the link is a new version of the bill
                    # or some other document. NOTE: as far as we can tell,
                    # new versions are the ones labeled 'Entirillado'
                    # (amended text), but that is an educated guess.
                    if re.match('Entirillado', action):
                        bill.add_version(action, action_url)
                    else:
                        bill.add_document(action, action_url)

                for pattern, atype in _classifiers:
                    if re.match(pattern, action):
                        break
                else:
                    atype = 'other'

                bill.add_action(chamber, action, date, type=atype)

                if atype == 'bill:passed' and action_url:
                    vote_chamber = None
                    for pattern, vote_chamber in _voteChambers:
                        if re.match(pattern, action):
                            break
                    else:
                        self.warning("couldn't find voteChamber pattern")

                    # action_url is already a non-empty string here
                    if vote_chamber == 'lower':
                        vote = self.scrape_votes(action_url, action, date,
                                                 vote_chamber)
                        if vote[0] is not None:
                            vote[0].add_source(action_url)
                            bill.add_vote(vote[0])
                        else:
                            self.warning('Problem reading vote: %s, %s' %
                                         (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
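# A hedged sketch of the module-level tables the for/else loops above
# consume. _classifiers is assumed to be an ordered list of
# (regex, action type) pairs; the patterns shown are illustrative guesses,
# not the scraper's real table.
_classifiers = [
    (u'Radicado', 'bill:introduced'),
    (u'Aprobado por C(a|á)mara', 'bill:passed'),
    (u'Aprobado por el Senado', 'bill:passed'),
    (u'Vetado por el Gobernador', 'governor:vetoed'),
]

# _voteChambers would pair passage phrasing with a chamber the same way,
# e.g. [(u'Aprobado por C(a|á)mara', 'lower'),
#       (u'Aprobado por el Senado', 'upper')].
#
# Note the for/else idiom above: the else clause runs only when the loop
# finishes without a break, i.e. when no pattern matched, which is what
# lets atype fall back to 'other'.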
Exemplo n.º 55
0
    def scrape_bill(self,
                    session,
                    chamber,
                    bill_id,
                    title,
                    url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):

        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill['subjects'] = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version(**version)

        # Resolution pages have different html.
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath('td/strong/text()')
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, '').strip()
            values[heading] = value

        # summary was always the same as the title
        # bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
        if primary:
            bill.add_sponsor('primary', primary)

        # Add cosponsors.
        if values.get('SPONSORS:'):
            sponsors = strip_sponsors('', values['SPONSORS:'])
            sponsors = re.split(r', (?![A-Z]\.)', sponsors)
            for name in sponsors:
                name = name.strip(', \n\r')
                if name:
                    # Fix name-splitting bug where "Neale, D. Hall" is
                    # really two sponsors
                    match = re.search(r'(.+?), ([DM]\. Hall)', name)
                    if match:
                        for name in match.groups():
                            bill.add_sponsor('cosponsor', name)
                    else:
                        bill.add_sponsor('cosponsor', name)

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            self.scrape_house_vote(bill, link.attrib['href'])

        for tr in reversed(
                page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            chamber_letter = tds[0].text_content()
            chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith('passed senate'):
                for href in tds[1].xpath('a/@href'):
                    self.scrape_senate_vote(bill, href, date)

            attrs = dict(actor=chamber, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        self.save_bill(bill)
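    # Note on the strip_sponsors default argument above: binding
    # re.compile(r'...').sub as a parameter default compiles the pattern
    # once, at function-definition time, and gives the body a short local
    # name for the bound sub() method. A standalone illustration:
    #
    #   import re
    #   strip_parens = re.compile(r'\s*\(.{,50}\)\s*').sub
    #   strip_parens('', 'Smith (R - District 3)')  # -> 'Smith'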
Exemplo n.º 56
0
    def scrape_bill(self, bill_url, chamber, session):
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            page = lxml.html.fromstring(text)
            page.make_links_absolute(bill_url)

            bill_id = page.xpath("string(//h2)").split()[0]

            summary = page.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])")
            summary = summary.replace('Summary: ', '')

            match = re.match(
                r"^([^:]+): "
                r"((\(Constitutional [aA]mendment\) )?[^(]+)", summary)

            if match:
                subjects = [match.group(1).strip()]
                title = match.group(2).strip()
            else:
                raise ScrapeError("Bad title")

            if bill_id.startswith('SB') or bill_id.startswith('HB'):
                bill_type = ['bill']
            elif bill_id.startswith('SR') or bill_id.startswith('HR'):
                bill_type = ['resolution']
            elif bill_id.startswith('SCR') or bill_id.startswith('HCR'):
                bill_type = ['concurrent resolution']
            else:
                raise ScrapeError("Invalid bill ID format: %s" % bill_id)

            if title.startswith("(Constitutional Amendment)"):
                bill_type.append('constitutional amendment')
                title = title.replace('(Constitutional Amendment) ', '')

            bill = Bill(session,
                        chamber,
                        bill_id,
                        title,
                        subjects=subjects,
                        type=bill_type)
            bill.add_source(bill_url)

            history_link = page.xpath("//a[text() = 'History']")[0]
            history_url = history_link.attrib['href']
            self.scrape_history(bill, history_url)

            authors_link = page.xpath("//a[text() = 'Authors']")[0]
            authors_url = authors_link.attrib['href']
            self.scrape_authors(bill, authors_url)

            try:
                versions_link = page.xpath(
                    "//a[text() = 'Text - All Versions']")[0]
                versions_url = versions_link.attrib['href']
                self.scrape_versions(bill, versions_url)
                for doc in ["Notes", "Digest", "Amendments", "Misc"]:
                    doc_link = page.xpath("//a[text() = '%s']" % doc)[0]
                    doc_url = doc_link.attrib['href']
                    self.scrape_documents(bill, doc_url)
            except IndexError:
                # Only current version
                try:
                    version_link = page.xpath(
                        "//a[text() = 'Text - Current']")[0]
                    version_url = version_link.attrib['href']
                    bill.add_version("%s Current" % bill_id,
                                     version_url,
                                     on_duplicate="use_old")
                except IndexError:
                    # Some bills don't have any versions :(
                    pass

            try:
                votes_link = page.xpath("//a[text() = 'Votes']")[0]
                self.scrape_votes(bill, votes_link.attrib['href'])
            except IndexError:
                # Some bills don't have any votes
                pass

            self.save_bill(bill)

            return True
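    # The "find an <a> by its text, then follow its href" pattern repeated
    # above could be factored into a small helper; a hypothetical sketch,
    # not part of the original scraper:
    def _link_href(self, page, text):
        links = page.xpath("//a[text() = '%s']" % text)
        return links[0].attrib['href'] if links else None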
Exemplo n.º 57
0
    def process_bill(self, data):
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        if chamber == 'legislature':
            chamber = 'upper'
        bill = Bill(data['legislative_session'],
                    chamber,
                    data['identifier'],
                    data['title'],
                    subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(
                action['organization_id'])['classification']
            legislators = []
            committees = []
            for rel in action['related_entities']:
                if rel['entity_type'] == 'organization':
                    committees.append(rel['name'])
                elif rel['entity_type'] == 'person':
                    legislators.append(rel['name'])
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']),
                            committees=committees,
                            legislators=legislators,
                            **action.get('extras', {}))

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(
                sponsor['classification'],
                sponsor['name'],
            )

        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'],
                                 link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']),
                                 **version.get('extras', {}))

        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'],
                                  link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']),
                                  **doc.get('extras', {}))

        for title in data['other_titles']:
            bill.add_title(title['title'])

        for related in data['related_bills']:
            bill.add_companion(related['identifier'],
                               related['legislative_session'], chamber)

        bill['alternate_bill_ids'] = [
            oi['identifier'] for oi in data['other_identifiers']
        ]
        self.save_bill(bill)
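# A hedged sketch of the parse_psuedo_id helper used above. In
# opencivicdata-style exports, unresolved organization references are
# "pseudo ids": a '~' followed by a JSON object such as
# '~{"classification": "lower"}'. The real helper may differ.
import json

def parse_psuedo_id(psuedo_id):
    assert psuedo_id.startswith('~'), "not a pseudo id"
    return json.loads(psuedo_id[1:])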
Exemplo n.º 58
0
    def parse_bill(self, chamber, session, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            try:
                short_bill_id = re.sub(r'S([JC])R', r'S\1', bill_id)

                version_link = page.xpath(
                    "//a[contains(@href, '%s/bill.doc')]" % short_bill_id)[0]
            except IndexError:
                # Bill withdrawn
                return

            pars = version_link.xpath("following-sibling::p")
            if len(pars) == 2:
                title = pars[0].xpath("string()")
                action_p = pars[1]
            else:
                title = pars[0].getprevious().tail
                action_p = pars[0]

            title = re.sub(ur'[\s\xa0]+', ' ', title).strip()

            if 'CR' in bill_id:
                bill_type = 'concurrent resolution'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = self._subjects[bill_id]
            bill.add_source(url)

            bill.add_version("Most Recent Version",
                             version_link.attrib['href'])

            for link in page.xpath("//a[contains(@href, 'legislator/')]"):
                bill.add_sponsor('primary', link.text.strip())

            for line in action_p.xpath("string()").split("\n"):
                action = line.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                action_date = "%s %s" % (action.split('-')[0],
                                         session[0:4])
                action_date = datetime.datetime.strptime(
                    action_date, '%b %d %Y')

                action = '-'.join(action.split('-')[1:])

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                atype = []
                if action.startswith('introduced in'):
                    atype.append('bill:introduced')
                    if '; to ' in action:
                        atype.append('committee:referred')
                elif action.startswith('signed by Governor'):
                    atype.append('governor:signed')
                elif re.match(r'^to [A-Z]', action):
                    atype.append('committee:referred')
                elif action == 'adopted by voice vote':
                    atype.append('bill:passed')

                if '1st reading' in action:
                    atype.append('bill:reading:1')
                if '2nd reading' in action:
                    atype.append('bill:reading:2')
                if '3rd reading' in action:
                    atype.append('bill:reading:3')

                if ('R' in bill_id and 'adopted by voice vote' in action
                        and 'bill:passed' not in atype):
                    atype.append('bill:passed')

                amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*'
                                r'( and \([a-z\d\-]+\))? filed')
                if re.search(amendment_re, action):
                    atype.append('amendment:introduced')

                if not atype:
                    atype = ['other']

                bill.add_action(actor, action, action_date, type=atype)

            try:
                votes_link = page.xpath(
                    "//a[contains(@href, 'vote_history.pdf')]")[0]
                bill.add_document("Vote History",
                                  votes_link.attrib['href'])
            except IndexError:
                # No votes
                pass

            self.save_bill(bill)
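    # Note: bill['subjects'] = self._subjects[bill_id] above assumes that
    # self._subjects was prepopulated by a separate subject-index pass as a
    # defaultdict(list), so a bill with no indexed subjects gets an empty
    # list instead of raising KeyError. A minimal sketch of that assumption:
    #
    #   from collections import defaultdict
    #   self._subjects = defaultdict(list)
    #   self._subjects['HB 1'].append('EDUCATION')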
Exemplo n.º 59
0
    def parse_bill_xml(self, chamber, session, txt):
        root = lxml.etree.fromstring(txt.bytes)
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
        bill_title = root.findtext("caption")

        if session[2] == 'R':
            session = session[0:2]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        for version in root.xpath("//versions//version"):
            description = version.xpath(".//versionDescription")[0].text
            html_url = version.xpath(".//WebHTMLURL")[0].text
            bill.add_version(description, html_url, 'text/html')

        introduced = False

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            actor = {
                'H': 'lower',
                'S': 'upper',
                'E': 'executive'
            }[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                continue

            if desc == 'Amended':
                atype = 'amendment:passed'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment:introduced'
            elif desc == 'Amendment amended':
                atype = 'amendment:amended'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment:withdrawn'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'bill:passed'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'bill:introduced'
                else:
                    atype = 'bill:filed'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'governor:received'
            elif desc.startswith('Signed by the Governor'):
                atype = 'governor:signed'
            elif desc == 'Vetoed by the Governor':
                atype = 'governor:vetoed'
            elif desc == 'Read first time':
                atype = ['bill:introduced', 'bill:reading:1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['bill:passed']
                if not introduced:
                    introduced = True
                    atype.append('bill:introduced')
            elif desc == "Passed as amended":
                atype = 'bill:passed'
            elif desc.startswith('Referred to') or desc.startswith(
                    "Recommended to be sent to "):
                atype = 'committee:referred'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee:passed'
            elif desc == "Filed":
                atype = 'bill:filed'
            elif desc == 'Read 3rd time':
                atype = 'bill:reading:3'
            elif desc == 'Read 2nd time':
                atype = 'bill:reading:2'
            elif desc.startswith('Reported favorably'):
                atype = 'committee:passed:favorable'
            else:
                atype = 'other'

            if 'committee:referred' in atype:
                repls = ['Referred to', "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                extra['committee'] = ctty

            bill.add_action(actor,
                            action.findtext('description'),
                            act_date,
                            type=atype,
                            **extra)

        for field, sponsor_type in (('authors', 'author'),
                                    ('coauthors', 'coauthor'),
                                    ('sponsors', 'sponsor'),
                                    ('cosponsors', 'cosponsor')):
            for name in root.findtext(field).split(' | '):
                if name:
                    bill.add_sponsor(sponsor_type, name)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        return bill
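    # Note on the atype handling above: atype is sometimes a plain string
    # ('committee:referred') and sometimes a list of strings, so the check
    # "'committee:referred' in atype" works only because `in` does substring
    # search on str and membership testing on list. A normalization that
    # would make this explicit (hypothetical, Python 2 like the scraper):
    #
    #   if isinstance(atype, basestring):
    #       atype = [atype]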
Exemplo n.º 60
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of data)
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        with self.urlopen(sheet_url) as sheet_html:
            sheet_page = lxml.html.fromstring(sheet_html)

            bills = sheet_page.xpath('//table/tr')

            for bill in bills:
                bill_id = self.read_td(bill[index["id"]][0])

                if bill_id is None:
                    # Every other entry is null for some reason
                    continue

                dot_loc = bill_id.find('.')
                if dot_loc != -1:
                    # strip the file extension; budget bills have no ".pdf",
                    # so only truncate when a dot is actually present
                    bill_id = bill_id[:dot_loc]
                title_and_sponsor = bill[index["title_sponsor"]][0]

                bill_title = title_and_sponsor.text
                bill_title_and_sponsor = title_and_sponsor.text_content()
                sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                    replace(" & ...", "").split("--")

                cats = {
                    "SB": "bill",
                    "HB": "bill",
                    "HR": "resolution",
                    "SR": "resolution",
                    "SCR": "concurrent resolution",
                    "HCR": "concurrent resolution",
                    "SJR": "joint resolution",
                    "HJR": "joint resolution",
                    "SM": "memorial",
                    "HM": "memorial"
                }

                bill_type = None

                for cat in cats:
                    if bill_id[:len(cat)] == cat:
                        bill_type = cats[cat]
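                # Note: the loop above relies on no key in cats being a
                # prefix of another; a deterministic longest-prefix variant
                # (hypothetical) would be:
                #
                #   for cat in sorted(cats, key=len, reverse=True):
                #       if bill_id.startswith(cat):
                #           bill_type = cats[cat]
                #           break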

                b = Bill(session,
                         bill_chamber,
                         bill_id,
                         bill_title,
                         type=bill_type)

                b.add_source(sheet_url)

                versions_url = \
                    bill[index["version"]].xpath('font/a')[0].attrib["href"]
                versions_url = CO_URL_BASE + versions_url
                versions = self.parse_versions(versions_url)
                for version in versions:
                    b.add_version(version['name'],
                                  version['link'],
                                  mimetype=version['mimetype'])

                bill_history_href = CO_URL_BASE + \
                    bill[index["history"]][0][0].attrib['href']
                # ^^^^^^^ We assume this is a full path to the target.
                # might want to consider some better rel-path support
                # XXX: Look at this ^

                history = self.parse_history(bill_history_href)
                b.add_source(bill_history_href)

                for action in history:
                    self.add_action_to_bill(b, action)

                for sponsor in sponsors:
                    if sponsor and sponsor != "(NONE)":
                        b.add_sponsor("primary", sponsor)

                # Now that we have history, let's see if we can't grab some
                # votes

                bill_vote_href = self.get_vote_url(bill_id, session)
                votes = self.parse_votes(bill_vote_href)

                if votes['sanity-check'] != bill_id:
                    self.warning("XXX: READ ME! Sanity check failed!")
                    self.warning(" -> Scraped ID: " + votes['sanity-check'])
                    self.warning(" -> 'Real' ID:  " + bill_id)
                    assert votes['sanity-check'] == bill_id

                for vote in votes['votes']:
                    filed_votes = vote['votes']
                    passage = vote['meta']
                    result = vote['result']

                    composite_time = "%s %s" % (passage['x-parent-date'],
                                                passage['TIME'])
                    # It's now like: 04/01/2011 02:10:14 PM
                    pydate = dt.datetime.strptime(composite_time,
                                                  "%m/%d/%Y %I:%M:%S %p")
                    hasHouse = "House" in passage['x-parent-ctty']
                    hasSenate = "Senate" in passage['x-parent-ctty']

                    if hasHouse and hasSenate:
                        actor = "joint"
                    elif hasHouse:
                        actor = "lower"
                    else:
                        actor = "upper"

                    other = (int(result['EXC']) + int(result['ABS']))
                    # OK, sometimes the Other count is wrong.
                    local_other = 0
                    for voter in filed_votes:
                        l_vote = filed_votes[voter].lower().strip()
                        if l_vote not in ("yes", "no"):
                            local_other += 1

                    if local_other != other:
                        self.warning(
                            "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                        self.warning(" -> Old: %s // New: %s" %
                                     (other, local_other))
                        other = local_other

                    v = Vote(actor,
                             pydate,
                             passage['MOTION'],
                             (result['FINAL_ACTION'] == "PASS"),
                             int(result['YES']),
                             int(result['NO']),
                             other,
                             moved=passage['MOVED'],
                             seconded=passage['SECONDED'])

                    v.add_source(vote['meta']['url'])
                    # v.add_source( bill_vote_href )

                    # XXX: Add more stuff to kwargs, we have a ton of data
                    for who, cast in filed_votes.items():
                        cast = cast.lower()
                        if cast == "yes":
                            v.yes(who)
                        elif cast == "no":
                            v.no(who)
                        else:
                            v.other(who)
                    b.add_vote(v)
                self.save_bill(b)