Example #1
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = self.urlopen(url)
        except scrapelib.HTTPError:
            self.warning("couldn't open %s, skipping bill" % url)
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # normalize non-breaking spaces in the header text
        header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
        title, primary_sponsor = header.split(' -- ')

        if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
            bill_type = ['bill']
        elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
            bill_type = ['resolution']
        elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
            bill_type = ['joint resolution']

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        for link in page.xpath(
            '//a[contains(@href, "bills/") and text() = "HTML"]'):

            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'], mimetype="text/html")
            next = link.getnext()
            if next.text == "PDF":
                bill.add_version(name, next.attrib['href'],
                                 mimetype="application/pdf")

        for link in page.xpath(
            "//a[contains(@href, 'fnotes') and text() = 'HTML']"):

            bill.add_document("Fiscal Note", link.attrib['href'])

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill['subjects'] = subjects

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        self.save_bill(bill)
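
The loop above strips substitute markers from bill ids using a module-level
SUB_BLACKLIST constant that this listing does not show. A minimal sketch of
its likely shape (entries are illustrative, not the scraper's real values):

# Illustrative only -- the real marker list lives in the scraper module.
SUB_BLACKLIST = [
    "1st Sub.",
    "2nd Sub.",
    "3rd Sub.",
]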
Example #2
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            bill_type = {
                'F': 'bill',
                'R': 'resolution',
                'C': 'concurrent resolution'
            }[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill['subjects'] = self._subject_mapping[bill_id]
            bill.add_source(bill_detail_url)

            # grab sponsors
            sponsors = doc.xpath(
                '//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip())

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'])

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            version_doc = lxml.html.fromstring(version_html)
            for v in version_doc.xpath(
                    '//a[starts-with(@href, "/bin/getbill.php")]'):
                version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
                bill.add_version(v.text.strip(), version_url)

        self.save_bill(bill)
Example #3
    def scrape_bill(self, session, chamber, bill_type, bill_url):
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_url)

            # split "SB1 SD2 HD2" to get SB1
            bill_id = page.xpath('//a[@id="LinkButtonMeasure"]')[0].text_content().split()[0]

            title = page.xpath('//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text
            subjects = page.xpath('//span[@id="ListView1_ctrl0_report_titleLabel"]')[0].text.split('; ')
            subjects = [s.strip() for s in subjects if s.strip()]
            description = page.xpath('//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text
            sponsors = page.xpath('//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text
            referral = page.xpath('//span[contains(@id, "referral")]/text()')[0]

            bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                        type=bill_type, description=description, referral=referral)
            for sponsor in sponsors.split(', '):
                if sponsor.endswith(' (BR)'):
                    sponsor = sponsor[:-5]
                bill.add_sponsor('primary', sponsor)

            # actions
            actions = []

            table = page.xpath('//table[@id="GridViewStatus"]')[0]
            for row in table.xpath('tr'):
                action_params = {}
                cells = row.xpath('td')
                if len(cells) == 3:
                    ch = cells[1].xpath('font')[0].text
                    action_params['actor'] = house[ch]
                    action_params['action'] = cells[2].xpath('font')[0].text
                    action_date = cells[0].xpath('font')[0].text
                    action_params['date'] = datetime.strptime(action_date, "%m/%d/%Y")
                    action_params['type'] = categorize_action(action_params['action'])
                    actions.append(action_params)
            for action_params in actions:
                bill.add_action(**action_params)

                self.parse_vote(bill, action_params['action'],
                                action_params['actor'], action_params['date'])

            # add versions
            try:
                for version in page.xpath('//a[contains(@id, "StatusLink")]'):
                    bill.add_version(version.text.replace('_', ' '),
                                     version.get('href'))
            except IndexError: # href not found.
                pass

        bill.add_source(bill_url)
        self.save_bill(bill)
Example #4
    def parse_senate_billpage(self, bill_url, year):
        bill_page = self.urlopen(bill_url)
        bill_page = lxml.html.fromstring(bill_page)
        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
        bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
        bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()
        #print "bill id = "+ bill_id

        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self.subjects:
            subs = self.subjects[bid]
            self.log("With subjects for this bill")

        self.log(bid)

        bill = Bill(year, 'upper', bill_id, bill_desc,
                    bill_lr=bill_lr, type=bill_type, subjects=subs)
        bill.add_source(bill_url)

        # Get the primary sponsor
        sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
        bill_sponsor = sponsor.text_content()
        bill_sponsor_link = sponsor.attrib.get('href')
        bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
        if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.has_key('href'):
            self.parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

        # get the actions
        action_url = bill_page.xpath('//*[@id="hlAllActions"]')
        if len(action_url) > 0:
            action_url =  action_url[0].attrib['href']
            #print "actions = %s" % action_url
            self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
        if len(versions_url) > 0 and versions_url[0].attrib.has_key('href'):
            self.parse_senate_bill_versions(bill, versions_url[0].attrib['href'])

        self.save_bill(bill)
Example #5
    def parse_senate_billpage(self, bill_url, year):
        with self.urlopen(bill_url) as bill_page:
            bill_page = lxml.html.fromstring(bill_page)
            # get all the info needed to record the bill
            # TODO probably still needs to be fixed
            bill_id = bill_page.xpath(
                '//*[@id="lblBillNum"]')[0].text_content()
            bill_title = bill_page.xpath(
                '//*[@id="lblBillTitle"]')[0].text_content()
            bill_desc = bill_page.xpath(
                '//*[@id="lblBriefDesc"]')[0].text_content()
            bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()
            #print "bill id = "+ bill_id

            bill = Bill(year,
                        'upper',
                        bill_id,
                        bill_desc,
                        bill_url=bill_url,
                        bill_lr=bill_lr,
                        official_title=bill_title)
            bill.add_source(bill_url)

            # Get the primary sponsor
            sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
            bill_sponsor = sponsor.text_content()
            bill_sponsor_link = sponsor.attrib.get('href')
            bill.add_sponsor('primary',
                             bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # cosponsors show up on their own page, if they exist
            cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
            if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.has_key(
                    'href'):
                self.parse_senate_cosponsors(bill,
                                             cosponsor_tag[0].attrib['href'])

            # get the actions
            action_url = bill_page.xpath('//*[@id="hlAllActions"]')
            if len(action_url) > 0:
                action_url = action_url[0].attrib['href']
                #print "actions = %s" % action_url
                self.parse_senate_actions(bill, action_url)

            # stored on a separate page
            versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
            if len(versions_url) > 0 and versions_url[0].attrib.has_key(
                    'href'):
                self.parse_senate_bill_versions(bill,
                                                versions_url[0].attrib['href'])

        self.save_bill(bill)
Example #6
def test_on_duplicate():
    b = Bill('S1', 'upper', 'SB1', 'on_duplicate')
    b.add_version('current', 'http://example.com/doc/1', mimetype='text/html')

    # error
    with assert_raises(ValueError):
        b.add_version('current',
                      'http://example.com/doc/1',
                      mimetype='text/html',
                      on_duplicate='error')

    # without it set, the default is to error
    with assert_raises(ValueError):
        b.add_version('current',
                      'http://example.com/doc/1',
                      mimetype='text/html')

    # use_old - keep version name the same
    b.add_version('updated name',
                  'http://example.com/doc/1',
                  mimetype='text/html',
                  on_duplicate='use_old')
    assert_equal(b['versions'], [{
        'mimetype': 'text/html',
        'url': 'http://example.com/doc/1',
        'name': 'current'
    }])

    # use_new - replace the version name with the new one
    b.add_version('updated name',
                  'http://example.com/doc/1',
                  mimetype='text/html',
                  on_duplicate='use_new')
    assert_equal(b['versions'], [{
        'mimetype': 'text/html',
        'url': 'http://example.com/doc/1',
        'name': 'updated name'
    }])

    # a new document w/ same name is ok though
    b.add_version('updated name',
                  'http://example.com/doc/2',
                  mimetype='text/html',
                  on_duplicate='use_old')
    assert len(b['versions']) == 2

    # and now we add a duplicate
    b.add_version('current',
                  'http://example.com/doc/1',
                  mimetype='text/html',
                  on_duplicate='ignore')
    assert len(b['versions']) == 3
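
For scraper code, the same on_duplicate semantics mean a re-scrape can
re-add a version it has already seen without raising. A short usage sketch
(bill id, name, and URL are made up):

b = Bill('S1', 'upper', 'SB2', 'example')
b.add_version('Introduced', 'http://example.com/doc/9', mimetype='text/html')
# a later pass may see the same URL with a revised name; 'use_new' keeps
# the newer name instead of raising ValueError
b.add_version('Introduced (revised)', 'http://example.com/doc/9',
              mimetype='text/html', on_duplicate='use_new')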
Example #7
    def scrape_bill(self, bill_page_url):
        bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

        title = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()')
        if title:
            title = title[0]
        else:
            self.warning('Missing bill title {}'.format(bill_page_url))
            return False

        bill_no = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()')
        if bill_no:
            bill_no = bill_no[0]
        else:
            bill_no = bill_page.xpath(
                '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()'
            )
            if bill_no:
                bill_no = bill_no[0]
            else:
                self.error('Missing bill number {}'.format(bill_page_url))
                return False

        bill = Bill(session=self.session,
                    chamber='upper',
                    bill_id=bill_no,
                    title=title,
                    type='bill')

        bill.add_source(bill_page_url)

        self.parse_versions(bill, bill_page, bill_no)

        self.parse_acts(bill, bill_page)

        sponsors = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()')
        if sponsors:
            self.assign_sponsors(bill, sponsors[0], 'primary')

        cosponsors = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()')
        if cosponsors:
            self.assign_sponsors(bill, cosponsors[0], 'cosponsor')

        self.parse_date_actions(bill, bill_page)

        self.parse_actions(bill, bill_page)

        self.save_bill(bill)
Example #8
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            for aname in author.split(','):
                bill.add_sponsor('primary', self.clean_name(aname).strip())
            co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
            if len(co_authors) != 0:
                for co_author in co_authors[1].split(','):
                    bill.add_sponsor('cosponsor', self.clean_name(co_author).strip())
            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                tds = row.xpath('td')
                # ignore row missing date
                if len(tds) != 2:
                    continue
                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")
                action = tds[1].text_content().strip()
                # parse the text to see if it's a new version or an
                # unrelated document; if it has a link, assume it's a
                # vote document

                # get the url of the action
                action_url = tds[1].xpath('a/@href')
                atype, action = self.parse_action(chamber, bill, action,
                                                  action_url, date)
                if atype == 'bill:passed' and action_url:
                    vote_chamber = None
                    for pattern, vote_chamber in _voteChambers:
                        if re.match(pattern, action):
                            break
                    else:
                        self.warning("couldn't find vote chamber pattern")

                    if vote_chamber == 'lower' and len(action_url) > 0:
                        vote = self.scrape_votes(action_url[0], action,date,
                                                 vote_chamber)
                        if vote[0] is not None:
                            vote[0].add_source(action_url[0])
                            bill.add_vote(vote[0])
                        else:
                            self.warning('Problem Reading vote: %s,%s' %
                                         (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
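
The vote-chamber scan above iterates a module-level _voteChambers table of
(regex, chamber) pairs matched against Spanish action text. Its exact
contents are not shown in this listing; a hypothetical sketch:

# Hypothetical pairs; the real patterns live in the scraper module.
_voteChambers = (
    (u'Aprobado por el Senado', 'upper'),
    (u'Aprobado por la C', 'lower'),
)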
Example #9
    def scrape_bill(self, session, chamber, bill_id, short_title, url):
        if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
            return

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath("//br")[8].tail
            if not title:
                title = short_title
            title = title.strip()

            abbrev = bill_id.split()[0]
            if abbrev.endswith('B'):
                bill_type = ['bill']
            elif abbrev.endswith('JR'):
                bill_type = ['joint resolution']
            elif abbrev.endswith('CR'):
                bill_type = ['concurrent resolution']
            elif abbrev.endswith('R'):
                bill_type = ['resolution']

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                path = version_path % version_type
                links = page.xpath(path)
                if links:
                    bill.add_version(version_type, links[0].attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
                self.scrape_senate_vote(bill, vote_link.attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
                self.scrape_house_vote(bill, vote_link.attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Example #10
    def scrape_bill(self, chamber, session, bill_id):
        biennium = "%s-%s" % (session[0:4], session[7:9])
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        page = self.urlopen(url)
        page = lxml.etree.fromstring(page.bytes)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title, type=[bill_type])

        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" %
                       (bill_num, session[0:4]))
        bill.add_source(fake_source)

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        mimetype = 'text/html'
        version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                       "Htm/Bills/%s %ss/%s.htm" %
                       (biennium, chamber_name, bill_type.title(), bill_num))

        # Sometimes the measure's version_url isn't guessable. When that
        # happens, we have to get the URL from the source page.
        version_resp = self.get(version_url)
        if version_resp.status_code != 200:
            webpage = self.get(fake_source).text
            webdoc = lxml.html.fromstring(webpage)
            version_url = webdoc.xpath(
                '//a[contains(@href, "billdocs")]/@href')[-1]
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'

        bill.add_version(bill_id, version_url, mimetype=mimetype)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_votes(bill)
        self.fix_prefiled_action_dates(bill)

        return bill
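
Both Washington examples (this one and Example #27) call a module-level
xpath() helper that binds the wa: namespace used by the legislature's web
service. A minimal sketch, with the namespace URI as an assumption:

def xpath(elem, path):
    # assumed namespace URI for the WSL web service responses
    ns = {'wa': 'http://WSLWebServices.leg.wa.gov/'}
    return elem.xpath(path, namespaces=ns)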
Example #11
    def __init__(self, scraper, session, chamber, url, doc, bill_type, bill_id,
                 title, bill_id_parts):
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()
Example #12
    def scrape(self, chamber, session):
        try:
            for index in xrange(1, 1000):
                url = ("http://open.nysenate.gov/legislation/search/"
                       "?search=otype:bill&searchType=&format=xml"
                       "&pageIdx=%d" % index)
                with self.urlopen(url) as page:
                    page = lxml.etree.fromstring(page)

                    for result in page.xpath("//result[@type = 'bill']"):
                        bill_id = result.attrib['id'].split('-')[0]

                        title = result.attrib['title'].strip()
                        if title == '(no title)':
                            continue

                        primary_sponsor = result.attrib['sponsor']
                        primary_sponsor = re.sub(r'\s+\(MS\)\s*$', '',
                                                 primary_sponsor).strip()

                        bill_chamber, bill_type = {
                            'S': ('upper', 'bill'),
                            'R': ('upper', 'resolution'),
                            'J': ('upper', 'legislative resolution'),
                            'B': ('upper', 'concurrent resolution'),
                            'A': ('lower', 'bill'),
                            'E': ('lower', 'resolution'),
                            'K': ('lower', 'legislative resolution'),
                            'L': ('lower', 'joint resolution')
                        }[bill_id[0]]

                        if chamber != bill_chamber:
                            continue

                        bill = Bill(session,
                                    chamber,
                                    bill_id,
                                    title,
                                    type=bill_type)
                        bill.add_source(url)
                        bill.add_sponsor('primary', primary_sponsor)

                        bill_url = ("http://open.nysenate.gov/legislation/"
                                    "bill/%s" % result.attrib['id'])
                        self.scrape_bill(bill, bill_url)
                        bill.add_source(bill_url)

                        self.save_bill(bill)
        except scrapelib.HTTPError as e:
            if e.response.code != 404:
                raise
Example #13
    def process_bill(self, data):
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        bill = Bill(data['legislative_session'],
                    chamber,
                    data['identifier'],
                    data['title'],
                    subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(
                action['organization_id'])['classification']
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']))
            # TODO: related entities

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(
                sponsor['classification'],
                sponsor['name'],
            )

        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'],
                                 link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']))

        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'],
                                  link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']))

        for title in data['other_titles']:
            bill.add_title(title)

        # TODO: related bills
        # for related in data['related_bills']:

        self.save_bill(bill)
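
This processor assumes pupa-style pseudo-ids, i.e. a '~' followed by a JSON
object, for from_organization and organization_id. A minimal sketch of the
parser (the misspelled name is kept as the snippet uses it):

import json

def parse_psuedo_id(pseudo_id):
    # strip the leading '~' and decode the embedded JSON object
    return json.loads(pseudo_id[1:])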
Example #14
    def scrape(self, chamber, session):
        self.user_agent = 'openstates +mozilla'
        # internal id for the session, store on self so all methods have access
        self.site_id = self.metadata['session_details'][session]['site_id']

        self.build_subject_map()

        # used for skipping bills from opposite chamber
        start_letter = 'H' if chamber == 'lower' else 'S'

        url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id

        while url:
            with self.urlopen(url, retry_on_404=True) as html:
                doc = lxml.html.fromstring(html)

                url = None  # no more unless we encounter 'More...'

                bills = doc.xpath('//ul[@class="linkSect"]/li')
                for bill in bills:
                    link = bill.getchildren()[0]
                    bill_id = str(link.text_content())

                    # check if this is the 'More...' link
                    if bill_id.startswith('More'):
                        url = BASE_URL + link.get('href')

                    # skip bills from the other chamber
                    elif not bill_id.startswith(start_letter):
                        continue

                    else:
                        # create a bill
                        desc = bill.xpath('text()')[0].strip()
                        bill_type = {
                            'B': 'bill',
                            'J': 'joint resolution',
                            'R': 'resolution'
                        }[bill_id[1]]
                        bill = Bill(session,
                                    chamber,
                                    bill_id,
                                    desc,
                                    type=bill_type)

                        bill_url = BASE_URL + link.get('href')
                        self.fetch_sponsors(bill)
                        self.scrape_bill_details(bill_url, bill)
                        bill['subjects'] = self.subject_map[bill_id]
                        bill.add_source(bill_url)
                        self.save_bill(bill)
Example #15
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        chamber = 'lower' if chamber.lower() == 'house' else chamber
        chamber = 'upper' if chamber.lower() == 'senate' else chamber

        # Get html and parse
        bill_html = self.urlopen(bill_detail_url)
        doc = lxml.html.fromstring(bill_html)

        # Get the basic parts of the bill
        bill_id = doc.xpath('//h1/text()')[0]
        bill_title = doc.xpath('//h2/following-sibling::p/text()')[0].strip()
        bill_type = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution'
        }[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        # Add subjects.  Currently we are not mapping to Open States
        # standardized subjects, so use 'scraped_subjects'
        bill['scraped_subjects'] = self._subject_mapping[bill_id]

        # Get companion bill.
        companion = doc.xpath(
            '//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()'
        )
        companion = self.make_bill_id(
            companion[0]) if len(companion) > 0 else None
        if companion is not None:
            companion_chamber = self.chamber_from_bill(companion)
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        self.save_bill(bill)
Example #16
    def scrape_bill_status_page(self, url, params={}):
        """Scrape the status page at url, populating the params dict and
        returning the bill.
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)
            params['bill_id'] = page.xpath(
                '//h3[contains(@class, "center")]/a')[0].text
            params['title'] = page.xpath('//div[div[contains( \
                ., "Report Title")]]/div[contains(@class, "rightside")]'
                                         )[0].text.strip()
            sponsors = page.xpath('//div[div[contains( \
                ., "Introducer")]]/div[contains(@class, "rightside")]')[0].text
            subject = page.xpath('//div[div[contains( \
                ., "Measure Title")]]/div[contains(@class, "rightside")]'
                                 )[0].text.strip()
            subject = subject.replace('RELATING TO ', '')  # Remove lead text
            params['subject'] = subject.replace('.', '')
            params['description'] = page.xpath('//div[div[contains( \
                ., "Description")]]/div[contains(@class, "rightside")]'
                                               )[0].text
            params['companion'] = page.xpath('//div[div[contains( \
                ., "Companion")]]/div[contains(@class, "rightside")]')[0].text
            if params['title'] == '':
                params['title'] = params['subject']
            actions = []
            table = page.xpath('//table[tr/th[contains(., "Date")]]')[0]
            for row in table.xpath('tr[td]'):  # Ignore the table header row
                action_params = {}
                cells = row.xpath('td')
                if len(cells) == 3:
                    ch = cells[1].text
                    action_params['actor'] = house[ch]
                    action_params['action'] = cells[2].text
                    # Just get the date; ignore any time component.
                    action_date = cells[0].text.split()[0]
                    try:
                        action_params['date'] = datetime.strptime(
                            action_date, "%m/%d/%y")
                    except ValueError:  # Try a four-digit year format.
                        action_params['date'] = datetime.strptime(
                            action_date, "%m/%d/%Y")
                    actions.append(action_params)
            bill = Bill(**params)
            bill.add_sponsor('primary', sponsors)
            for action_params in actions:
                bill.add_action(**action_params)
        self.save_bill(bill)
        return bill
Example #17
    def all_scrape(self, chamber, session):
        url = 'ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt'
        data = self.urlopen(url).decode('UTF-8', 'ignore')
        count = 0
        for item in data.split('\n'):
            # skip blank lines
            if not item:
                continue
            (item_chamber, type, bill_number, title, title_sub_1,
             title_sub_2, title_sub_3, title_sub_4, title_sub_5,
             title_sub_6, record_id, initial_sponsor, act_number,
             initial_date, action_date, unknown_legislator, bill_id,
             congressional_session) = item.split('|')
            congressional_session = congressional_session.strip()
            if (congressional_session == session
                    and chamber == CHAMBERS[item_chamber]):
                bill = Bill(session,
                            chamber,
                            bill_id,
                            title,
                            act_number=act_number)
                if initial_sponsor:
                    bill.add_sponsor('primary', initial_sponsor)
                    bill.add_source(url)
                self.save_bill(bill)

                base_url = ('http://www.arkleg.state.ar.us/assembly/' +
                            congressional_session[:4] + '/' +
                            congressional_session + '/Pages/')

                try:
                    html = self.urlopen(
                        base_url + 'BillStatusHistory.aspx?measureno=' +
                        bill_id)
                except Exception:
                    pass
                else:
                    self.bill_history(bill, lxml.html.fromstring(html))

                try:
                    html = self.urlopen(base_url +
                                        'CoSponsors.aspx?measureno=' +
                                        bill_id)
                except Exception:
                    pass
                else:
                    self.add_sponsors(bill, lxml.html.fromstring(html))

                count += 1
        return count
Example #18
    def parse_bill(self, session, chamber, line):
        (type, combined_id, number, title, relating_to) = line.split("\xe4")
        if ((type[0] == 'H' and chamber == 'lower')
                or (type[0] == 'S' and chamber == 'upper')):

            # basic bill info
            bill_id = "%s %s" % (type, number)
            # lookup type without chamber prefix
            bill_type = self.bill_types[type[1:]]
            self.all_bills[bill_id] = Bill(session,
                                           chamber,
                                           bill_id,
                                           title,
                                           type=bill_type)
Example #19
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)
        scraped_bill_id = bill_page.xpath(
            "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
        bill_id = scraped_bill_id.split(' ')[0]
        versions = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        tables = bill_page.xpath("//table")
        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta['Report Title'].split(";")]
        if "" in subs:
            subs.remove("")

        b = Bill(session,
                 chamber,
                 bill_id,
                 title=meta['Measure Title'],
                 summary=meta['Description'],
                 referral=meta['Current Referral'],
                 subjects=subs,
                 type=bill_type)
        b.add_source(url)

        companion = meta['Companion'].strip()
        if companion:
            b['companion'] = companion

        prior = bill_page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        )[-1]
        if 'carried over' in prior.lower():
            prior_session = '{} Regular Session'.format(
                str(int(session[:4]) - 1))
            b.add_companion(bill_id, prior_session, chamber)

        for sponsor in meta['Introducer(s)']:
            b.add_sponsor(type='primary', name=sponsor)

        actions = self.parse_bill_actions_table(b, action_table)
        versions = self.parse_bill_versions_table(b, versions)

        self.save_bill(b)
Example #20
    def scrape_senate_bills(self, chamber, insert, session, year):
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                page = self.urlopen(page_path)
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)
                bill['subjects'] = self.subject_mapping[bill_id]

                bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text", text_url,
                                 mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "upper")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Example #21
    def scrape1995(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current',
                             url.replace('/sum/', '/fulltext/'),
                             mimetype='text/html')

            # Sponsorships
            rows = page.cssselect('center table tr')
            for row in rows:
                if row.text_content().strip() == 'Sponsor and CoSponsors':
                    continue
                if row.text_content().strip() == 'Links / Committees / Status':
                    break
                for a in row.cssselect('a'):
                    bill.add_sponsor('', a.text_content().strip())

            # Actions
            # The actions are in a pre table that looks like:
            """    SENATE                         HOUSE
                   -------------------------------------
                 1/13/95   Read 1st time          2/6/95
                 1/31/95   Favorably Reported
                 2/1/95    Read 2nd Time          2/7/95
                 2/3/95    Read 3rd Time
                 2/3/95    Passed/Adopted                   """

            actions = page.cssselect('pre')[0].text_content().split('\n')
            actions = actions[2:]
            for action in actions:
                senate_date = action[:22].strip()
                action_text = action[23:46].strip()
                house_date = action[46:].strip()

                if '/' not in senate_date and '/' not in house_date:
                    continue

                if senate_date:
                    bill.add_action('upper', action_text, senate_date)

                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Example #22
    def scrape_bill(self, chamber, session, doc_type, url):
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # bill id, title, synopsis
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip()
        synopsis = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type,
                    synopsis=synopsis)

        # sponsors
        for sponsor in doc.xpath('//a[@class="content"]/text()'):
            bill.add_sponsor('cosponsor', sponsor)

        # actions
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()

            bill.add_action(actor, action, date,
                            type=_categorize_action(action))

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(bill, votes_url)
            bill.add_source(votes_url)

        bill.add_source(url)
        self.save_bill(bill)
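
The action loop unpacks the flat list of td cells three at a time through a
group() helper the listing omits. The usual chunking idiom looks like this:

def group(lst, n):
    # return successive n-tuples from a flat list, e.g. (date, actor, action)
    return zip(*[iter(lst)] * n)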
Example #23
    def scrape_bill(self, chamber, session, bill_id, url):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            raise ValueError('unknown bill type ' + bill_id)

        bill = Bill(session, chamber, bill_id, title, type=_type)
        bill.add_source(url)

        # process sponsors
        sponsors = _get_td(doc, 'All Sponsors:').text_content()
        sponsors = sponsors.replace('Delegates ', '')
        sponsors = sponsors.replace('Delegate ', '')
        sponsors = sponsors.replace('Senator ', '')
        sponsors = sponsors.replace('Senators ', '')
        sponsor_type = 'primary'

        for sponsor in re.split(', (?:and )?', sponsors):
            sponsor = sponsor.strip()
            if not sponsor:
                continue
            bill.add_sponsor(sponsor_type, sponsor)
            sponsor_type = 'cosponsor'

        # subjects
        subject_list = []
        for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
            subjects = _get_td(doc, heading).xpath('a/text()')
            subject_list += [s.split(' -see also-')[0] for s in subjects if s]
        bill['subjects'] = subject_list

        # documents
        self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
        # actions
        self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

        self.save_bill(bill)
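
The sponsor and subject lookups go through a _get_td() helper that returns
the table cell following a given header label. A plausible sketch; the real
selector may differ:

def _get_td(doc, th_text):
    # assumed layout: a th holding the label, followed by the data td
    return doc.xpath('//th[text()="%s"]/following-sibling::td' % th_text)[0]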
Example #24
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {'H': 'lower', 'S': 'upper'}

        for row in page:
            bill_id = row['bill_num']
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            # assert that the bill data is from this session, CT is tricky
            assert row['sess_year'] == session

            if re.match(r'^(S|H)J', bill_id):
                bill_type = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session,
                        chamber,
                        bill_id,
                        row['bill_title'],
                        type=bill_type)
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsor('primary',
                                 introducer,
                                 official_type='introducer')

            try:
                self.scrape_bill_page(bill)

                bill['subjects'] = self._subjects[bill_id]

                self.bills[bill_id] = bill
            except SkipBill:
                self.warning('no such bill: ' + bill_id)
Example #25
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.urlopen(url)
        bill_page = lxml.html.fromstring(bill_html)
        scraped_bill_id = bill_page.xpath(
            "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
        bill_id = scraped_bill_id.split(' ')[0]
        versions = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        tables = bill_page.xpath("//table")
        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta['Report Title'].split(";")]
        if "" in subs:
            subs.remove("")

        b = Bill(session,
                 chamber,
                 bill_id,
                 title=meta['Measure Title'],
                 summary=meta['Description'],
                 referral=meta['Current Referral'],
                 subjects=subs,
                 type=bill_type)
        b.add_source(url)

        if not bill_id.startswith("SR"):
            return

        companion = meta['Companion'].strip()
        if companion:
            b['companion'] = companion

        for sponsor in meta['Introducer(s)']:
            b.add_sponsor(type='primary', name=sponsor)

        actions = self.parse_bill_actions_table(b, action_table)
        versions = self.parse_bill_versions_table(b, versions)

        self.save_bill(b)
Example #26
    def scrape(self, chamber, session):

        if int(session) < 2016:
            legacy = NHLegacyBillScraper(self.metadata, self.output_dir,
                                         self.strict_validation)
            legacy.scrape(chamber, session)
            # This throws an error because object_count isn't being properly incremented,
            # even though it saves fine. So fake the output_names
            self.output_names = ['1']
            return

        self.cursor.execute(
            "SELECT legislationnbr, documenttypecode, "
            "LegislativeBody, LSRTitle, CondensedBillNo, HouseDateIntroduced, "
            "legislationID, sessionyear, lsr, SubjectCode FROM Legislation "
            "WHERE sessionyear = {} AND LegislativeBody = '{}'".format(
                session, body_code[chamber]))

        for row in self.cursor.fetchall():
            bill_id = row['CondensedBillNo']
            bill_title = row['LSRTitle'].replace('(New Title)', '').strip()

            if row['documenttypecode'] in bill_type_map:
                bill_type = bill_type_map[row['documenttypecode']]

            bill = Bill(session,
                        chamber,
                        bill_id,
                        bill_title,
                        db_id=row['legislationID'],
                        type=bill_type)

            status_url = 'http://www.gencourt.state.nh.us/bill_status/bill_'\
                'status.aspx?lsr={}&sy={}&sortoption=&txtsessionyear={}'\
                .format(row['lsr'], session, session)

            bill.add_source(status_url)

            self.scrape_actions(bill)
            self.scrape_sponsors(bill)
            self.scrape_votes(bill)
            self.scrape_subjects(bill, row['SubjectCode'])
            self.scrape_versions(bill)

            self.save_bill(bill)
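
The query and Bill construction rely on two module-level lookups not shown
in this listing; their assumed shapes, inferred from how the scraper
indexes them:

# Assumed contents; the real tables live in the scraper module.
body_code = {'lower': 'H', 'upper': 'S'}
bill_type_map = {
    'B': 'bill',
    'R': 'resolution',
    'CR': 'concurrent resolution',
}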
Example #27
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, self.biennium, bill_num))

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title,
                    type=[bill_type])

        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))
        bill.add_source(fake_source)

        try:
            bill['versions'] = self.versions[bill_id]
        except KeyError:
            bill['versions'] = []
            self.warning("No versions were found for {}".format(bill_id))

        try:
            bill['documents'] = self.documents[bill_num]
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_votes(bill)
        self.fix_prefiled_action_dates(bill)

        return bill
Example #28
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
            if bill_chamber != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial resolution',
                'CMR': 'concurrent memorial resolution'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]
            if primary:
                bill.add_sponsor('primary', primary)

            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep an eye on this post-2017 to see if they apply R going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session_code, bill_id.replace(' ', '')))
            bill.add_version(bill_id, version_url, mimetype='application/pdf')

            self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
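
unicode_csv_reader is not shown in the listing; the name matches the
standard Python 2 recipe for reading CSV data as unicode, sketched here
under that assumption:

import csv

def unicode_csv_reader(unicode_csv_data, **kwargs):
    # the csv module wants bytes in Python 2: encode, parse, then decode
    utf8_lines = (line.encode('utf-8') for line in unicode_csv_data)
    for row in csv.reader(utf8_lines, **kwargs):
        yield [cell.decode('utf-8') for cell in row]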
Example #29
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == 'upper':
            bill_no = 1
            abbr = 'SB'
        else:
            bill_no = 4001
            abbr = 'HB'
        while True:
            bill_page = self.scrape_bill(session, abbr, bill_no)
            # if we can't find a page, we must be done. This is a healthy thing.
            if bill_page is None:
                return
            bill_page = BeautifulSoup(bill_page)
            title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
            title = title.replace('\n','').replace('\r','')
            bill_id = "%s %d" % (abbr, bill_no)

            the_bill = Bill(session, chamber, bill_id, title)

            #sponsors
            first = 0
            for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'):
                the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
                first = 1

            #versions
            for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r: the_bill.add_version(*r)

            #documents
            if 'frg_billstatus_HlaTable' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)
            if 'frg_billstatus_SfaSection' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)

            self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
            self.save_bill(the_bill)
            bill_no = bill_no + 1
Example #30
    def parse_bill(self, session, chamber, line):
        (type, combined_id, number, title, relating_to) = line.split(u"\xe4")
        if ((type[0] == 'H' and chamber == 'lower')
                or (type[0] == 'S' and chamber == 'upper')):

            # basic bill info
            bill_id = "%s %s" % (type, number)
            # lookup type without chamber prefix
            bill_type = self.bill_types[type[1:]]

            # may encounter an ellipsis in the source data
            title = title.replace(u'\x85', '...')

            self.all_bills[bill_id] = Bill(session,
                                           chamber,
                                           bill_id,
                                           title,
                                           type=bill_type)
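
Both this parser and Example #18 index self.bill_types with the bill-id
prefix minus its chamber letter (e.g. "JR" from "HJR"). A hypothetical
mapping consistent with that usage:

# Hypothetical contents; the real table is defined on the scraper class.
bill_types = {
    'B': 'bill',
    'R': 'resolution',
    'JR': 'joint resolution',
    'CR': 'concurrent resolution',
    'M': 'memorial',
}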