Example #1
    def handle_list_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib['href'] + '/ByCategory'

        if bill_id.startswith(('SB ', 'HB ', 'SPB ', 'HPB ')):
            bill_type = 'bill'
        elif bill_id.startswith(('HR ', 'SR ')):
            bill_type = 'resolution'
        elif bill_id.startswith(('HJR ', 'SJR ')):
            bill_type = 'joint resolution'
        elif bill_id.startswith(('SCR ', 'HCR ')):
            bill_type = 'concurrent resolution'
        elif bill_id.startswith(('SM ', 'HM ')):
            bill_type = 'memorial'
        else:
            raise ValueError('Failed to identify bill type.')

        bill = Bill(bill_id, self.kwargs['session'], title,
                    chamber='lower' if bill_id[0] == 'H' else 'upper',
                    classification=bill_type)
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', bill_id)
        bill.subject = list(self.kwargs['subjects'][subj_bill_id])

        sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
        for sp in sponsor.split(', '):
            bill.add_sponsorship(sp, 'primary', 'person', True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
Example #2
    def scrape_bill(self, chamber, session, bill_id, url):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        try:
            title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
            # TODO: grab summary (none present at time of writing)
        except IndexError:
            if 'Unable to retrieve the requested information. Please try again' in html:
                self.warning('Soft error page, skipping.')
                return
            else:
                raise

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            raise ValueError('unknown bill type ' + bill_id)

        bill = Bill(
            bill_id, legislative_session=session, chamber=chamber, title=title,
            classification=_type)
        bill.add_source(url)

        # process sponsors
        sponsors = _get_td(doc, 'All Sponsors:').text_content()
        sponsors = sponsors.replace('Delegates ', '')
        sponsors = sponsors.replace('Delegate ', '')
        sponsors = sponsors.replace('Senator ', '')
        sponsors = sponsors.replace('Senators ', '')
        sponsor_type = 'primary'

        for sponsor in re.split(', (?:and )?', sponsors):
            sponsor = sponsor.strip()
            if not sponsor:
                continue
            bill.add_sponsorship(
                sponsor,
                sponsor_type,
                primary=sponsor_type == 'primary',
                entity_type='person',
            )
            sponsor_type = 'cosponsor'

        # subjects
        subject_list = []
        for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
            subjects = _get_td(doc, heading).xpath('a/text()')
            subject_list += [s.split(' -see also-')[0] for s in subjects if s]
        bill.subject = subject_list

        # documents
        yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
        # actions
        self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

        yield bill
Example #3
File: bills.py  Project: azban/openstates
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
        versions = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta["Report Title"].split(";")]
        if "" in subs:
            subs.remove("")
        b = Bill(
            bill_id,
            session,
            meta["Measure Title"],
            chamber=chamber,
            classification=bill_type,
        )
        if meta["Description"]:
            b.add_abstract(meta["Description"], "description")
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
        companion = meta["Companion"].strip()
        if companion:
            b.add_related_bill(
                identifier=companion.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )
        if bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        ):
            prior = bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
            )[-1]
            if "carried over" in prior.lower():
                b.add_related_bill(
                    identifier=bill_id.replace(u"\xa0", " "),
                    legislative_session=prior_session,
                    relation_type="companion",
                )
        for sponsor in meta["Introducer(s)"]:
            b.add_sponsorship(sponsor, "primary", "person", True)
        versions = self.parse_bill_versions_table(b, versions)
        yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                                 session, url, chamber)
        yield b
Example #4
    def scrape_bill(self, session, bill_id, chamber):
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

        try:
            response = requests.get(bill_url)
        except requests.exceptions.RequestException as e:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        html = response.text

        page = lxml.html.fromstring(html)

        if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

        # strip everything except the chamber letter (S/H/D) and digits
        bill_id = re.sub(r'[^SHD\d]', '', bill_id)

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=bill_title, classification='bill')

        bill_summary = None
        if page.xpath('//p[@id="pinslip"]/text()'):
            bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Find the non-blank text of the dt following Sponsor or Presenter,
        # including any child link text.
        sponsor = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
        if sponsor:
            sponsor = sponsor[0].strip()
            bill.add_sponsorship(sponsor, classification='primary', primary=True,
                                 entity_type='person')

        self.scrape_cosponsors(bill, bill_url)

        version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                             "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

        # yield back votes and bill
        yield from self.scrape_actions(bill, bill_url, session)
        yield bill
Example #5
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(),
                          delimiter="|")

        for row in page:
            bill_chamber = {"H": "lower", "S": "upper"}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
            bill_type = {
                "B": "bill",
                "R": "resolution",
                "JR": "joint resolution",
                "CR": "concurrent resolution",
                "MR": "memorial",
                "CMR": "concurrent memorial",
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row[3],
                classification=bill_type,
            )
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(
                    primary,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" %
                           (self.slug, bill_id.replace(" ", "")))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type="application/pdf")

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #6
    def scrape_bills(self, session):
        session_key = SESSION_KEYS[session]
        measures_response = self.api_client.get('measures', page=500, session=session_key)

        legislators = index_legislators(self, session_key)

        for measure in measures_response:
            bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

            chamber = self.chamber_code[bid[0]]
            bill = Bill(
                bid.replace(' ', ''),
                legislative_session=session,
                chamber=chamber,
                title=measure['RelatingTo'],
                classification=self.bill_types[measure['MeasurePrefix'][1:]]
            )
            bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

            for sponsor in measure['MeasureSponsors']:
                legislator_code = sponsor['LegislatoreCode']  # typo in API
                if legislator_code:
                    try:
                        legislator = legislators[legislator_code]
                    except KeyError:
                        logger.warn('Legislator {} not found in session {}'.format(
                            legislator_code, session))
                        legislator = legislator_code
                    bill.add_sponsorship(
                        name=legislator,
                        classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                            sponsor['SponsorLevel']],
                        entity_type='person',
                        primary=True if sponsor['SponsorLevel'] == 'Chief' else False
                    )

            bill.add_source(
                "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                    session=session_key, bid=bid.replace(' ', ''))
            )
            for document in measure['MeasureDocuments']:
                # TODO: probably mixing documents & versions here - should revisit
                try:
                    bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                          media_type='application/pdf')
                except ValueError:
                    logger.warn('Duplicate link found for {}'.format(document['DocumentUrl']))
            for action in measure['MeasureHistoryActions']:
                classifiers = self.determine_action_classifiers(action['ActionText'])
                when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                bill.add_action(action['ActionText'], when,
                                chamber=self.chamber_code[action['Chamber']],
                                classification=classifiers)

            yield bill
Example #7
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=row[3],
                        classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep an eye on this post 2017 to see if they apply R going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" %
                           (session_code, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #8
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {"H": "lower", "S": "upper"}

        for row in page:
            bill_id = row["bill_num"]
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            if re.match(r"^(S|H)J", bill_id):
                bill_type = "joint resolution"
            elif re.match(r"^(S|H)R", bill_id):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title=row["bill_title"],
                classification=bill_type,
                chamber=chamber,
            )
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                introducer = string.capwords(
                    introducer.decode("utf-8").replace("Rep. ", "").replace(
                        "Sen. ", ""))
                if "Dist." in introducer:
                    introducer = " ".join(introducer.split()[:-2])
                bill.add_sponsorship(
                    name=introducer,
                    classification="primary",
                    primary=True,
                    entity_type="person",
                )

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning("no such bill: " + bill_id)
                pass
Example #9
    def scrape_bill(self, chamber, session, bill_id, url):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            raise ValueError('unknown bill type ' + bill_id)

        bill = Bill(
            bill_id, legislative_session=session, chamber=chamber, title=title,
            classification=_type)
        bill.add_source(url)

        # process sponsors
        sponsors = _get_td(doc, 'All Sponsors:').text_content()
        sponsors = sponsors.replace('Delegates ', '')
        sponsors = sponsors.replace('Delegate ', '')
        sponsors = sponsors.replace('Senator ', '')
        sponsors = sponsors.replace('Senators ', '')
        sponsor_type = 'primary'

        for sponsor in re.split(', (?:and )?', sponsors):
            sponsor = sponsor.strip()
            if not sponsor:
                continue
            bill.add_sponsorship(
                sponsor,
                sponsor_type,
                primary=sponsor_type == 'primary',
                entity_type='person',
            )
            sponsor_type = 'cosponsor'

        # subjects
        subject_list = []
        for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
            subjects = _get_td(doc, heading).xpath('a/text()')
            subject_list += [s.split(' -see also-')[0] for s in subjects if s]
        bill.subject = subject_list

        # documents
        yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
        # actions
        self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

        yield bill
Example #10
    def scrape_bill(self, bill_url, bill_id, session_id):
        page = self.lxmlize(bill_url)

        # create bill
        title = page.xpath("//em/text()")[0]
        bill = Bill(identifier=bill_id,
                    legislative_session=session_id,
                    title=title)
        bill.add_source(bill_url, note="detail")

        # add additional fields

        data_table = page.xpath("//table[@class='data vertical_table']")[0]

        # sponsor
        sponsor_name = data_table.xpath(self.bill_table_query("Sponsor"))[0]
        bill.add_sponsorship(name=sponsor_name,
                             classification="Primary",
                             entity_type="person",
                             primary=True)

        # abstract
        try:
            summary = data_table.xpath(self.bill_table_query("Summary"))[0]
            bill.add_abstract(abstract=summary, note="summary")
            # TODO trim whitespace from summary
        except IndexError:
            print("No summary for bill {} in session {}".format(
                bill_id, session_id))

        # actions
        action_lines = data_table.xpath(self.bill_table_query("Actions"))
        for line in action_lines:
            try:
                for date_str, action_type in self.parse_actions(line):
                    bill.add_action(date=date_str,
                                    description=action_type,
                                    classification=action_type)
                    print("added action: {}".format(action_type))
            except ValueError:
                print("failed to parse these actions: {}".format([line]))

        # co-sponsors
        co_sponsors = data_table.xpath(self.bill_table_query("Co-Sponsors"))
        co_sponsors = [name.strip() for name in co_sponsors if name.strip()]
        for name in co_sponsors:
            bill.add_sponsorship(name=name,
                                 classification="co-sponsor",
                                 entity_type="person",
                                 primary=False)

        return bill
Example #11
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(),
                          delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=row[3],
                        classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" %
                           (self.slug, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #12
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)
            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep an eye on this post 2017 to see if they apply R going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session_code, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #13
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {'H': 'lower', 'S': 'upper'}

        for row in page:
            bill_id = row['bill_num']
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            # assert that the bill data is from this session, CT is tricky
            assert row['sess_year'] == session

            if re.match(r'^(S|H)J', bill_id):
                bill_type = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(identifier=bill_id,
                        legislative_session=session,
                        title=row['bill_title'],
                        classification=bill_type,
                        chamber=chamber)
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsorship(name=str(introducer),
                                     classification='primary',
                                     primary=True,
                                     entity_type='person')

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning('no such bill: ' + bill_id)
                pass
Example #14
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {'H': 'lower', 'S': 'upper'}

        for row in page:
            bill_id = row['bill_num']
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            # assert that the bill data is from this session, CT is tricky
            assert row['sess_year'] == session

            if re.match(r'^(S|H)J', bill_id):
                bill_type = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(identifier=bill_id,
                        legislative_session=session,
                        title=row['bill_title'],
                        classification=bill_type,
                        chamber=chamber)
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsorship(name=introducer.decode('utf-8'),
                                     classification='primary',
                                     primary=True,
                                     entity_type='person')

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning('no such bill: ' + bill_id)
                pass
Example #15
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" % (
                               self.slug, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #16
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = '{}{}'.format(qs['billtype'], qs['billnumber'])
        versions = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta['Report Title'].split(";")]
        if "" in subs:
            subs.remove("")
        b = Bill(bill_id, session, meta['Measure Title'],
                 chamber=chamber,
                 classification=bill_type)
        if meta['Description']:
            b.add_abstract(meta['Description'], 'description')
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
        companion = meta['Companion'].strip()
        if companion:
            b.add_related_bill(identifier=companion.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")
        prior = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
        if 'carried over' in prior.lower():
            b.add_related_bill(identifier=bill_id.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")
        for sponsor in meta['Introducer(s)']:
            b.add_sponsorship(sponsor, 'primary', 'person', True)
        versions = self.parse_bill_versions_table(b, versions)
        yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
        yield b
Example #17
    def createBill(self, agenda_item):
        title = agenda_item['Title'].replace('\n', ' ')
        title, primary_role, primary_sponsor, secondary_role, secondary_sponsor = re.match(agenda_item_title_re, title).groups()

        bill = {
            'identifier': agenda_item['Item No.'],
            'title': title,
            'legislative_session': agenda_item['session'],
            # TODO: Add agenda_item type to OCD
            'classification': 'bill',
            'from_organization': {'name': self.jurisdiction.name},
        }

        b = Bill(**bill)
        b.add_source(agenda_item['url'], note='web')

        if primary_sponsor and secondary_sponsor:
            b.add_sponsorship(primary_sponsor, 'mover', 'person', True)
            b.add_sponsorship(secondary_sponsor, 'seconder', 'person', False)

        return b
Example #18
    def handle_list_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.kwargs["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.kwargs["subjects"][subj_bill_id])

        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            bill.add_sponsorship(sp, "primary", "person", True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
Example #19
    def createBill(self, agenda_item):
        title = agenda_item['Title'].replace('\n', ' ')
        title, primary_role, primary_sponsor, secondary_role, secondary_sponsor = re.match(
            agenda_item_title_re, title).groups()

        bill = {
            'identifier': agenda_item['Item No.'],
            'title': title,
            'legislative_session': agenda_item['session'],
            # TODO: Add agenda_item type to OCD
            'classification': 'bill',
            'from_organization': {
                'name': self.jurisdiction.name
            },
        }

        b = Bill(**bill)
        b.add_source(agenda_item['url'], note='web')

        if primary_sponsor and secondary_sponsor:
            b.add_sponsorship(primary_sponsor, 'mover', 'person', True)
            b.add_sponsorship(secondary_sponsor, 'seconder', 'person', False)

        return b
Example #20
    def scrape_bill(self, session, chamber, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)

        title = page.xpath('//div[@id="main_0_header"]//h1/text()')[0].strip()

        parsed = urlparse.urlparse(url)
        bill_id = urlparse.parse_qs(parsed.query)['bId'][0]

        portfolio = self.dd(page, 'Portfolio')
        orig_house = self.dd(page, 'Originating house')
        print(bill_id, title, portfolio, orig_house)

        bill_chamber = self.CHAMBERS[orig_house]

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=bill_chamber,
                    title=title,
                    classification='bill')

        sponsor = self.dd(page, 'Sponsor(s)')
        if sponsor:
            bill.add_sponsorship(name=sponsor,
                                 classification="Primary",
                                 entity_type="person",
                                 primary=True)

        self.scrape_bill_actions(page, bill)
        self.scrape_bill_versions(page, bill)
        self.scrape_bill_documents(page, bill)

        bill.add_source(url)

        yield bill
Example #21
    def scrape_bill_list(self, url):
        bill_list = self._get_bill_list(url)

        for bill_info in bill_list:

            (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
            (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
            (subject, ) = bill_info.xpath('td[3]//text()')
            subject = subject.strip()
            chamber = self.CHAMBERS[bill_id[0]]

            if 'B' in bill_id:
                bill_type = 'bill'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                raise AssertionError(
                    "Unknown bill type for bill '{}'".format(bill_id))

            bill = Bill(
                bill_id,
                legislative_session=self.session,
                chamber=chamber,
                title='',
                classification=bill_type,
            )
            if subject:
                bill.subject = [subject]
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )
            bill.add_source(url)

            bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                        'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
            bill.add_source(bill_url)

            bill_html = self._get_bill_response(bill_url)
            if bill_html is None:
                self.warning(
                    "Bill {} has no webpage, and will be skipped".format(
                        bill_id))
                continue
            bill_doc = lxml.html.fromstring(bill_html)

            # default to an empty title so the fallback below also covers a missing span
            title = ''
            if (bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
                title = bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
                )[0].text_content().strip()
            if not title:
                title = "[No title given by state]"
            bill.title = title

            version_url_base = (
                'http://alisondb.legislature.state.al.us/ALISON/'
                'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                    self.session, bill_id))
            versions = bill_doc.xpath(
                '//table[@class="box_versions"]/tr/td[2]/font/text()')
            for version in versions:
                name = version
                if version == "Introduced":
                    version_url = version_url_base + 'int.pdf'
                elif version == "Engrossed":
                    version_url = version_url_base + 'eng.pdf'
                elif version == "Enrolled":
                    version_url = version_url_base + 'enr.pdf'
                else:
                    raise NotImplementedError(
                        "Unknown version type found: '{}'".format(name))

                bill.add_version_link(
                    name,
                    version_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

            # Fiscal notes exist, but I can't figure out how to build their URL
            fiscal_notes = bill_doc.xpath(
                '//table[@class="box_fiscalnote"]')[1:]
            for fiscal_note in fiscal_notes:
                pass

            # Budget Isolation Resolutions are handled as extra actions/votes
            birs = bill_doc.xpath(
                '//div[@class="box_bir"]//table//table/tr')[1:]
            for bir in birs:
                bir_action = bir.xpath('td[1]')[0].text_content().strip()
                # Sometimes ALISON's database puts another bill's
                # actions into the BIR action list; ignore these
                if bill_id not in bir_action:
                    self.warning(
                        "BIR action found ({}) ".format(bir_action) +
                        "that doesn't match the bill ID ({})".format(bill_id))
                    continue

                bir_date = datetime.datetime.strptime(
                    bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
                bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
                bir_chamber = self.CHAMBERS[bir_type[0]]
                bir_text = "{0}: {1}".format(
                    bir_type,
                    bir.xpath('td[3]/font/text()')[0].strip())

                bill.add_action(
                    bir_text,
                    TIMEZONE.localize(bir_date),
                    chamber=bir_chamber,
                    classification='other',
                )

                try:
                    (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
                except ValueError:
                    bir_vote_id = ''

                bir_vote_id = bir_vote_id.strip()
                if bir_vote_id.startswith("Roll "):
                    bir_vote_id = bir_vote_id.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=bir_type[0],
                        bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                        vote_id=bir_vote_id,
                        vote_date=TIMEZONE.localize(bir_date),
                        action_text=bir_text)

            actions = bill_doc.xpath(
                '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
            action_date = None
            for action in actions:
                # If actions occur on the same day, only one date will exist
                if (action.xpath('td[1]/font/text()')[0].encode(
                        'ascii', 'ignore').strip()):
                    action_date = datetime.datetime.strptime(
                        action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

                (action_chamber, ) = action.xpath('td[2]/font/text()')

                if action.xpath('td[3]/font/u/text()'):
                    (amendment, ) = action.xpath('td[3]/font/u/text()')
                else:
                    amendment = None

                (action_text, ) = action.xpath('td[4]/font/text()')

                action_type = _categorize_action(action_text)

                # check for occasional extra last row
                if not action_chamber.strip():
                    continue

                # The committee cell is just an abbreviation, so get its name
                actor = self.CHAMBERS[action_chamber]
                try:
                    action_committee = re.search(
                        r'.*? referred to the .*? committee on (.*?)$',
                        action_text).group(1).strip()
                except AttributeError:
                    action_committee = ''

                act = bill.add_action(
                    action_text,
                    TIMEZONE.localize(action_date),
                    chamber=actor,
                    classification=action_type,
                )
                if action_committee:
                    act.add_related_entity(action_committee,
                                           entity_type='organization')

                try:
                    vote_button = action.xpath('td[9]//text()')[0].strip()
                except IndexError:
                    vote_button = ''

                if vote_button.startswith("Roll "):
                    vote_id = vote_button.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=action_chamber,
                        bill_id=bill_id,
                        vote_id=vote_id,
                        vote_date=TIMEZONE.localize(action_date),
                        action_text=action_text)

                if amendment:
                    amend_url = (
                        'http://alisondb.legislature.state.al.us/ALISON/'
                        'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                            self.session, amendment))

                    amend_name = 'Amd/Sub {}'.format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type='application/pdf',
                        on_duplicate='ignore',
                    )

            yield bill
Example #22
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {"Senate": "upper", "House": "lower",
                            "House of Representatives": "lower",
                            "house": "lower", "senate": "upper"}

            # so presumably not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {"approved": True,
                            "passed": True,
                            "adopted": True,
                            "true": True,
                            "false": False,
                            "failed": False,
                            True: True,
                            False: False}

            action_dict = {"ref_ctte_100": "referral-committee",
                           "intro_100": "introduction",
                           "pass_300": "passage",
                           "intro_110": "reading-1",
                           "refer_210": "referral-committee",
                           "crpt_301": None,
                           "crpt_317": None,
                           "concur_606": "passage",
                           "pass_301": "passage",
                           "refer_220": "referral-committee",
                           "intro_102": ["introduction", "passage"],
                           "intro_105": ["introduction", "passage"],
                           "intro_ref_ctte_100": "referral-committee",
                           "refer_209": None,
                           "intro_108": ["introduction", "passage"],
                           "intro_103": ["introduction", "passage"],
                           "msg_reso_503": "passage",
                           "intro_107": ["introduction", "passage"],
                           "imm_consid_360": "passage",
                           "refer_213": None,
                           "adopt_reso_100": "passage",
                           "msg_507": "amendment-passage",
                           "confer_713": None,
                           "concur_603": None,
                           "confer_712": None,
                           "msg_506": "amendment-failure",
                           "receive_message_100": "passage",
                           "motion_920": None,
                           "concur_611": None,
                           "confer_735": None
                           }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

            for row in self.get_bill_rows(session):
                number_link, ga, title, primary_sponsor, status = row.xpath('td')

                bill_id = number_link.text_content()
                title = title.text_content().strip()
                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                                'general_assembly_{}/{}/{}/'.format(
                                    session,
                                    'bills' if 'B' in bill_id else 'resolutions',
                                    bill_id.lower().replace(' ', '')
                                ))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url+version["pdfDownloadLink"]
                    bill.add_version_link(version_name, version_link, media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification='primary',
                        entity_type='person',
                        primary=True,
                    )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification='cosponsor',
                        entity_type='person',
                        primary=False,
                    )

                try:
                    action_doc = self.get(base_url+bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning("Unknown action {desc} with code {code}."
                                         " Add it to the action_dict"
                                         ".".format(desc=action_desc,
                                                    code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(datetime.datetime.strptime(
                                                 action["datetime"],
                                                 "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date, chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

                # votes
                vote_url = base_url+bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url+bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError("Whoa, a disapprove! We've never"
                                             " gotten one before."
                                             " Go write some code to deal "
                                             "with it: {}".format(disapprove_url))

                yield bill
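The add_document calls above rely on a helper that is not part of this excerpt. A minimal sketch, assuming each all_* argument is a dict keyed by bill ID whose values carry a display name and a relative link (the field names below are hypothetical):

    def add_document(self, documents, bill_id, note, bill, base_url):
        # Hypothetical sketch: look up documents of this type collected earlier
        # for the bill and attach them; bills without any are skipped silently.
        for doc in documents.get(bill_id, []):
            # "name" and "link" are assumed field names, not confirmed by this excerpt.
            bill.add_document_link(note="{} {}".format(note, doc["name"]),
                                   url=base_url + doc["link"],
                                   on_duplicate="ignore")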
Example #24
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        bill_page = self.get(url, verify=False).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title,
                    classification=bill_type)
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(' ', '')]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, 'short title')

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version_link(note=name, url=href, media_type="application/pdf")
            else:
                bill.add_document_link(note=name, url=href, media_type="application/pdf")

        def _split(string):
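            # Note: "[,|AND]" is a character class (any single one of , | A N D),
            # so this splits on a word followed by one of those characters and
            # trailing whitespace.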
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if 'COMMITTEE' in sponsors.upper():
                    bill.add_sponsorship(name=sponsors.strip(), entity_type="organization",
                                         primary=True, classification='primary')
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(classification='primary', name=person,
                                                 entity_type="person", primary=True)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y").strftime('%Y-%m-%d')
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is separated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        yield bill
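get_bill_type and get_action are module-level helpers that are not shown here. A minimal sketch of get_bill_type, assuming the same prefix-based classification used by the other scrapers in this collection (the exact prefixes are assumptions):

    def get_bill_type(bill_id):
        # Hypothetical prefix map; returns an OCD-style classification list.
        prefix = bill_id.split()[0].upper()
        if 'JR' in prefix:
            return ['joint resolution']
        elif 'CR' in prefix:
            return ['concurrent resolution']
        elif 'R' in prefix:
            return ['resolution']
        return ['bill']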
Example #25
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])"
        ).strip()
        if not title:
            self.warning("blank bill on %s - skipping", url)
            return

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ":" in name:
                raise Exception(name)
            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsorship(
                    name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
            else:
                bill.add_sponsorship(
                    name, classification="primary", entity_type="person", primary=True
                )

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs["committees"]:
                related_entities.append({"type": "committee", "name": item})
            for item in attrs["legislators"]:
                related_entities.append({"type": "legislator", "name": item})
            bill.add_action(
                description=action,
                date=date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=attrs["classification"],
                related_entities=related_entities,
            )

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib["href"]
            if version_url in version_urls:
                self.warning("Skipping duplicate version URL.")
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE):
                bill.add_document_link(
                    note=name, url=version_url, media_type="application/pdf"
                )
                continue

            bill.add_version_link(
                note=name, url=version_url, media_type="application/pdf"
            )

        self.scrape_amendments(bill, page)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if "HT_" not in link.attrib["href"]:
                yield from self.scrape_votes(bill, self.urlescape(link.attrib["href"]))

        # If the bill has no real title ("Short Title Not Found."), it's a bogus
        # bill on their website, which appears to happen occasionally. Skip it;
        # otherwise, save the bill.
        if bill.title == "Short Title Not Found.":
            return
        yield bill
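self.subject_map is built elsewhere in this scraper before bills are processed. A minimal sketch of the assumed shape, a collections.defaultdict mapping bill IDs to subject lists (the listing helper used below is hypothetical, and the collections import is assumed):

    def build_subject_map(self):
        # Hypothetical sketch: collect subject headings and the bill IDs filed
        # under each, then invert that into bill_id -> [subjects]. A defaultdict
        # lets bills with no subjects fall back to an empty list.
        self.subject_map = collections.defaultdict(list)
        for subject, bill_ids in self.scrape_subject_listing():  # assumed helper
            for bill_id in bill_ids:
                self.subject_map[bill_id].append(subject)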
Example #26
    def scrape_bill_type(self,
                         chamber,
                         session,
                         bill_type,
                         type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower')
                    or (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest text (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime('%m/%d/%y')
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(version_name,
                                        version_url_pdf,
                                        media_type='application/pdf',
                                        date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                'Assembly': 'lower',
                                'Senate': 'upper'
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on',
                             action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    # Materialize the filtered committees so the list can be
                    # checked and iterated below without being exhausted.
                    committees = [c for c in committees if c]
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(committees) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime('%Y-%m-%d'),
                    chamber=actor,
                    classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(committee,
                                              entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = ('http://leginfo.legislature.ca.gov/faces'
                              '/billVotesClient.xhtml?bill_id={}').format(
                                  fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
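etree_text_content, used when pulling the digest paragraphs above, is defined elsewhere in the module. A minimal sketch, assuming it only normalizes text extraction across lxml and plain ElementTree elements:

    def etree_text_content(el):
        # lxml.html elements provide text_content(); fall back to itertext()
        # for plain ElementTree / lxml.etree elements.
        try:
            return el.text_content()
        except AttributeError:
            return ''.join(el.itertext())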
Example #27
	def scrape_bill(self, bill_url, bill_id, session_id):
		page = self.lxmlize(bill_url)
		# create bill
		title = page.xpath("//h1/text()")[0]
		bill = Bill(identifier=bill_id,
			        legislative_session=session_id,
			        title=title)
		bill.add_source(bill_url, note="detail")

		# add additional fields

		# abstract
		try:
			# abstract is directly above <h2>Legislative History</h2>
			leg_his = page.xpath("//h2[text()='Legislative History']")[0]
			abstract = leg_his.xpath("preceding-sibling::p/text()")[0]
			bill.add_abstract(abstract=abstract.strip(), note="summary")
		except IndexError:
			print("No abstract for bill {} in session {}".format(bill_id, session_id))

		# the rest of the fields are found inside this <table>
		data_table = page.xpath("//table[contains(@class, 'data')]")[0]

		# sponsor
		sponsor_name = data_table.xpath(self.bill_table_query("Sponsor") + "/text()")[0]
		bill.add_sponsorship(name=sponsor_name,
				classification="Primary",
				entity_type="person",
				primary=True
				)

		# actions
		action_lines = data_table.xpath(self.bill_table_query("Actions") + "/text()")
		for line in action_lines:
			line = line.strip()
			try:
				for date_str, action_type in self.parse_actions(line):
					bill.add_action(date=date_str,
						description=action_type,	
						classification=action_type)
			except ValueError:
				print("failed to parse these actions: {}".format([line]))


		# co-sponsors
		co_sponsors = data_table.xpath(self.bill_table_query("Co-Sponsors") + "/text()")
		co_sponsors = [name.strip() for name in co_sponsors if name.strip()]
		for name in co_sponsors:
			bill.add_sponsorship(name=name,
						classification="co-sponsor",
						entity_type="person",
						primary=False)

		# committee (stored as another sponsorship in OCD)
		committees = data_table.xpath(self.bill_table_query("Committee") + "/a/text()")
		for comm in committees:
			bill.add_sponsorship(name=comm,
							classification="secondary", # classification ?
							entity_type="organization",
							primary=False)

		return bill
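bill_table_query, used for the Sponsor, Actions, Co-Sponsors and Committee lookups above, is not shown in this excerpt. A minimal sketch, assuming each row of the data table pairs a header cell with a value cell (the markup is an assumption):

	def bill_table_query(self, label):
		# Hypothetical: select the value cell in the row whose header cell
		# matches the given label ("Sponsor", "Actions", ...).
		return ".//tr[th[normalize-space() = '{}']]/td".format(label)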
Example #28
    def scrape_bill(self, session, chamber, bill_id, title, url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):

        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill.subject = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version_link(**version)

        # Resolution pages have different html.
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath('td/strong/text()')
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, '').strip()
            values[heading] = value

        # summary was always same as title
        # bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
        if primary:
            bill.add_sponsorship(
                name=primary,
                classification='primary',
                entity_type='person',
                primary=True
            )

        # Add cosponsors.
        if values.get('SPONSORS:'):
            sponsors = strip_sponsors('', values['SPONSORS:'])
            sponsors = re.split(r', (?![A-Z]\.)', sponsors)
            for name in sponsors:
                name = name.strip(', \n\r')
                if name:
                    # Fix name splitting bug where "Neale, D. Hall"
                    match = re.search(r'(.+?), ([DM]\. Hall)', name)
                    if match:
                        for name in match.groups():
                            bill.add_sponsorship(
                                name=name,
                                classification='cosponsor',
                                entity_type='person',
                                primary=False
                            )
                    else:
                        bill.add_sponsorship(
                            name=name,
                            classification='cosponsor',
                            entity_type='person',
                            primary=False
                        )

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            yield from self.scrape_house_vote(bill, link.attrib['href'])

        for tr in reversed(page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            chamber_letter = tds[0].text_content()
            chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith('passed senate'):
                for href in tds[1].xpath('a/@href'):
                    yield from self.scrape_senate_vote(bill, href, date)

            attrs = dict(chamber=chamber, description=action, date=date.strftime("%Y-%m-%d"))
            temp = self.categorizer.categorize(action)
            related_entities = []
            for key, values in temp.items():
                if key != 'classification':
                    for value in values:
                        related_entities.append({
                            "type": key,
                            "name": value
                        })
            attrs.update(classification=temp['classification'], related_entities=related_entities)
            bill.add_action(**attrs)

        yield bill
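The strip_sponsors default argument binds re.sub for a pattern that strips short parenthetical qualifiers from sponsor strings; for illustration (the input string is made up), a call looks like:

    strip_sponsors = re.compile(r'\s*\(.{,50}\)\s*').sub
    strip_sponsors('', 'Smith (By Request)')  # -> 'Smith'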
Example #29
    def parse_bill_status_page(self, status_url, bill_url, session, chamber):
        status_page = lxml.html.fromstring(self.get(status_url).text)
        # see 2007 HB 2... weird.
        bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf'
        bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href'
        bill_id = re.search(bill_re, status_page.xpath(bill_xpath)[0],
                            re.IGNORECASE).groups()
        bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1]))

        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = status_page.xpath(xp).pop()
        except IndexError:
            title = status_page.xpath('//tr[1]/td[2]')[0].text_content()

        # Add bill type.
        _bill_id = bill_id.lower()
        if 'b' in _bill_id:
            classification = 'bill'
        elif 'j' in _bill_id or 'jr' in _bill_id:
            classification = 'joint resolution'
        elif 'cr' in _bill_id:
            classification = 'concurrent resolution'
        elif 'r' in _bill_id:
            classification = 'resolution'

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=classification)

        self.add_actions(bill, status_page)
        votes = self.add_votes(bill, status_page, status_url)

        tabledata = self._get_tabledata(status_page)

        # Add sponsor info.
        bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary',
                             entity_type='person', primary=True)

        # Various extra fields MT provides.
        plus_fields = [
            'requester',
            ('chapter number:', 'chapter'),
            'transmittal date:',
            'drafter',
            'fiscal note probable:',
            'bill draft number:',
            'preintroduction required:',
            'by request of',
            'category:']

        for x in plus_fields:
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = key = x
                key = key.replace(' ', '_')

            try:
                val = tabledata[_key]
            except KeyError:
                continue

            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        subjects = []
        for tr in status_page.xpath(xp):
            try:
                subj = tr.xpath('td')[0].text_content()
            except IndexError:
                continue
            subjects.append(subj)

        for s in subjects:
            bill.add_subject(s)

        self.add_fiscal_notes(status_page, bill)

        return bill, list(votes)
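_get_tabledata, used for the sponsor and extras lookups above, is defined elsewhere. A minimal sketch, assuming the status page lays its metadata out as label/value table rows (the exact markup is an assumption):

    def _get_tabledata(self, status_page):
        # Hypothetical sketch: map each lower-cased row label
        # ("primary sponsor:", "drafter", ...) to the list of values in its row.
        tabledata = {}
        for tr in status_page.xpath('//table//tr[td]'):
            cells = [td.text_content().strip() for td in tr.xpath('td')]
            if len(cells) >= 2:
                tabledata[cells[0].lower()] = cells[1:]
        return tabledata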
Example #30
    def scrape_chamber(self, chamber, session):
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
                bill.add_sponsorship(
                    name=primary_sponsor,
                    entity_type='organization' if "committee" in primary_sponsor.lower()
                                else 'person',
                    primary=True,
                    classification="original sponsor"
                )
            for sponsor in bill_data['SPONSOR_NAMES']:
                if sponsor in bill_data['ORIGINAL_SPONSOR']:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='organization' if "committee" in sponsor.lower() else 'person',
                    primary=False,
                    classification='cosponsor',
                )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = event['session_date']
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(
                    action, date, chamber=actor, classification=atype)

            # Versions are exposed in `bill_data['versions']`,
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML.
            yield from self.scrape_html(bill, session)

            yield bill
Example #31
    def scrape_bill(self, bill_num, session):
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                        '{}?calendarDate='.format(
                            session, bill_num)
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

        bill = Bill(identifier=bill_json['bill'],
                    legislative_session=session,
                    title=bill_json['catchTitle'],
                    chamber=chamber,
                    classification="bill",
                    )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(session,
                                                                      bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            actor = None
            if action_json['location'] and action_json['location'] in chamber_map:
                actor = chamber_map[action_json['location']]

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']}

        if bill_json['introduced']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

            bill.add_version_link(note="Introduced",
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

        if bill_json['enrolledAct']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

            bill.add_version_link(note="Enrolled",
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

        if bill_json['fiscalNote']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

            bill.add_document_link(note="Fiscal Note",
                                   url=url,
                                   media_type="application/pdf"  # optional but useful!
                                   )

        if bill_json['digest']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

            bill.add_document_link(note="Bill Digest",
                                   url=url,
                                   media_type="application/pdf"  # optional but useful!
                                   )

        if bill_json['vetoes']:
            for veto in bill_json['vetoes']:
                url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
                bill.add_version_link(note=veto['vetoLinkText'],
                                      url=url,
                                      media_type="application/pdf"  # optional but useful!
                                      )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type="application/pdf",
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
            bill.add_sponsorship(
                name=sponsor['name'],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor['primarySponsor']
            )

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            eff = datetime.datetime.strptime(
                bill_json['effectiveDate'], '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
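parse_local_date, used for the action dates above, is not included in this excerpt. A minimal sketch, assuming the API reports local (Mountain) timestamps that should be converted to UTC (the timezone and input format are assumptions):

    import pytz
    from dateutil import parser as dateparser

    def parse_local_date(self, date_str):
        # Hypothetical sketch: parse the status date, treat it as local
        # (Mountain) time, and return the equivalent UTC datetime.
        local_tz = pytz.timezone('America/Denver')
        return local_tz.localize(dateparser.parse(date_str)).astimezone(pytz.utc)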
Example #32
    def scrape_bill_list(self, url):
        bill_list = self._get_bill_list(url)

        for bill_info in bill_list:

            (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
            (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
            (subject, ) = bill_info.xpath('td[3]//text()')
            subject = subject.strip()
            chamber = self.CHAMBERS[bill_id[0]]

            if 'B' in bill_id:
                bill_type = 'bill'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                raise AssertionError(
                    "Unknown bill type for bill '{}'".format(bill_id))

            bill = Bill(
                bill_id,
                legislative_session=self.session,
                chamber=chamber,
                title='',
                classification=bill_type,
            )
            if subject:
                bill.subject = [subject]
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )
            bill.add_source(url)

            bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                        'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
            bill.add_source(bill_url)

            bill_html = self._get_bill_response(bill_url)
            if bill_html is None:
                self.warning("Bill {} has no webpage, and will be skipped".
                             format(bill_id))
                continue
            bill_doc = lxml.html.fromstring(bill_html)

            title = ''
            if bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
                title = bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
                )[0].text_content().strip()
            if not title:
                title = "[No title given by state]"
            bill.title = title

            version_url_base = (
                'http://alisondb.legislature.state.al.us/ALISON/'
                'SearchableInstruments/{0}/PrintFiles/{1}-'.
                format(self.session, bill_id))
            versions = bill_doc.xpath(
                '//table[@class="box_versions"]/tr/td[2]/font/text()')
            for version in versions:
                name = version
                if version == "Introduced":
                    version_url = version_url_base + 'int.pdf'
                elif version == "Engrossed":
                    version_url = version_url_base + 'eng.pdf'
                elif version == "Enrolled":
                    version_url = version_url_base + 'enr.pdf'
                else:
                    raise NotImplementedError(
                        "Unknown version type found: '{}'".format(name))

                bill.add_version_link(
                    name,
                    version_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

            # Fiscal notes exist, but I can't figure out how to build their URL
            fiscal_notes = bill_doc.xpath(
                '//table[@class="box_fiscalnote"]')[1:]
            for fiscal_note in fiscal_notes:
                pass

            # Budget Isolation Resolutions are handled as extra actions/votes
            birs = bill_doc.xpath(
                '//div[@class="box_bir"]//table//table/tr')[1:]
            for bir in birs:
                bir_action = bir.xpath('td[1]')[0].text_content().strip()
                # Sometimes ALISON's database puts another bill's
                # actions into the BIR action list; ignore these
                if bill_id not in bir_action:
                    self.warning(
                        "BIR action found ({}) ".format(bir_action) +
                        "that doesn't match the bill ID ({})".format(bill_id))
                    continue

                bir_date = datetime.datetime.strptime(
                    bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
                bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
                bir_chamber = self.CHAMBERS[bir_type[0]]
                bir_text = "{0}: {1}".format(
                    bir_type, bir.xpath('td[3]/font/text()')[0].strip())

                bill.add_action(
                    bir_text,
                    TIMEZONE.localize(bir_date),
                    chamber=bir_chamber,
                    classification='other',
                )

                try:
                    (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
                except ValueError:
                    bir_vote_id = ''

                bir_vote_id = bir_vote_id.strip()
                if bir_vote_id.startswith("Roll "):
                    bir_vote_id = bir_vote_id.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=bir_type[0],
                        bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                        vote_id=bir_vote_id,
                        vote_date=TIMEZONE.localize(bir_date),
                        action_text=bir_text
                    )

            actions = bill_doc.xpath('//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
            action_date = None
            for action in actions:
                # If actions occur on the same day, only one date will exist
                if (action.xpath('td[1]/font/text()')[0].
                        encode('ascii', 'ignore').strip()):
                    action_date = datetime.datetime.strptime(
                        action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

                (action_chamber, ) = action.xpath('td[2]/font/text()')

                if action.xpath('td[3]/font/u/text()'):
                    (amendment, ) = action.xpath('td[3]/font/u/text()')
                else:
                    amendment = None

                (action_text, ) = action.xpath('td[4]/font/text()')

                action_type = _categorize_action(action_text)

                # check for occasional extra last row
                if not action_chamber.strip():
                    continue

                # The committee cell is just an abbreviation, so get its name
                actor = self.CHAMBERS[action_chamber]
                try:
                    action_committee = re.search(
                        r'.*? referred to the .*? committee on (.*?)$',
                        action_text).group(1).strip()
                except AttributeError:
                    action_committee = ''

                act = bill.add_action(
                    action_text,
                    TIMEZONE.localize(action_date),
                    chamber=actor,
                    classification=action_type,
                )
                if action_committee:
                    act.add_related_entity(action_committee, entity_type='organization')

                try:
                    vote_button = action.xpath('td[9]//text()')[0].strip()
                except IndexError:
                    vote_button = ''

                if vote_button.startswith("Roll "):
                    vote_id = vote_button.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=action_chamber,
                        bill_id=bill_id,
                        vote_id=vote_id,
                        vote_date=TIMEZONE.localize(action_date),
                        action_text=action_text
                    )

                if amendment:
                    amend_url = (
                        'http://alisondb.legislature.state.al.us/ALISON/'
                        'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.
                        format(self.session, amendment))

                    amend_name = 'Amd/Sub {}'.format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type='application/pdf',
                        on_duplicate='ignore',
                    )

            yield bill
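_categorize_action, used when walking the history table above, is a module-level helper that is not shown here. A minimal sketch that matches action text against a few regex rules (the patterns are illustrative, not Alabama's real rule set; re is assumed to be imported at module level):

    def _categorize_action(action_text):
        # Hypothetical rule table mapping action text to an OCD classification;
        # returns None when nothing matches.
        rules = [
            (r'read for the first time', 'reading-1'),
            (r'read for the second time', 'reading-2'),
            (r'delivered to governor', 'executive-receipt'),
        ]
        for pattern, classification in rules:
            if re.search(pattern, action_text, re.IGNORECASE):
                return classification
        return None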
Example #33
    def scrape_bills(self, session, year_abr):
        # Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec['IdenticalBillNumber'].strip():
                bill.add_related_bill(
                    rec['IdenticalBillNumber'].split()[0],
                    legislative_session=session,
                    relation_type='companion',
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == 'P':
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsorship(name, classification=sponsor_type, entity_type='person',
                                 primary=sponsor_type == 'primary')

        # Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]

            # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
                year_abr,
                document.replace('.DOC', '.HTM'),
            )

            # name the document based on its DocType
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = re.sub('X$', '', htm_url)

            if rec['DocType'] in self._version_types:
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                try:
                    bill.add_version_link(doc_name, htm_url, media_type=mimetype)
                except ValueError:
                    self.warning("Couldn't find a document for bill {}".format(bill_id))
                    pass
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            'A%s' % year_abr,
            'A%s' % next_year,
            'S%s' % year_abr,
            'S%s' % next_year,
            'CA%s-%s' % (year_abr, next_year),
            'CS%s-%s' % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zippedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = io.TextIOWrapper(zippedfile.open(vfile))
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)

                votes = {}
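                # A/CA files are Assembly votes, S/CS files are Senate votes;
                # a leading C marks a committee vote file.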
                if filename.startswith('A') or filename.startswith('CA'):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith('C'):
                    vote_file_type = 'committee'
                else:
                    vote_file_type = 'chamber'

                for rec in vdict_file:
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                        vote_parts = (bill_id, chamber, action)
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N); [0:1] avoids an IndexError on ''
                        leg_vote = rec['LegislatorVote'][0:1]
                        committee = rec['Committee_House']
                        vote_parts = (bill_id, chamber, action, committee)

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join(vote_parts).replace(' ', '_')

                    if bill_id[0] == 'A':
                        b_chamber = "lower"
                    else:
                        b_chamber = "upper"

                    if vote_id not in votes:
                        votes[vote_id] = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            chamber=chamber,
                            motion_text=action,
                            classification='passage',
                            result=None,
                            bill=bill_id,
                            bill_chamber=b_chamber,
                            legislative_session=session,
                        )
                    if leg_vote == "Y":
                        votes[vote_id].vote('yes', leg)
                    elif leg_vote == "N":
                        votes[vote_id].vote('no', leg)
                    else:
                        votes[vote_id].vote('other', leg)

            # remove temp file
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count['option']] += 1
                vote.set_count('yes', counts['yes'])
                vote.set_count('no', counts['no'])
                vote.set_count('other', counts['other'])

                # Veto override.
                if vote.motion_text == 'OVERRIDE':
                    # Per the NJ legislature's glossary, a veto override requires
                    # two-thirds of each chamber: 27 in the Senate, 54 in the Assembly.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if vote.chamber == 'lower':
                        vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                    elif vote.chamber == 'upper':
                        vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
                else:
                    # Regular vote.
                    vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

                vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield vote

        # Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.subject.append(rec['SubjectKey'])
            else:
                self.warning('invalid bill id in BillSubj: %s' % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # skip probable phony bills (no actions and no versions)
            if not bill.actions and not bill.versions:
                self.warning('probable phony bill detected %s',
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield bill

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Example #34
0
    def _scrape_bill(self, session, bill_data):
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, (prefix, number, active_version)) = details

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title or bill_data['summary'],
            classification=bill_type,
        )

        if bill_data['summary']:
            bill.add_abstract(bill_data['summary'], note='')

        bill_active_version = bill_data['amendments']['items'][active_version]
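        # The active amendment supplies the co-sponsor list and the 'sameAs'
        # companion-bill data used below.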

        # Parse sponsors.
        if bill_data['sponsor'] is not None:
            if bill_data['sponsor']['rules'] is True:
                bill.add_sponsorship(
                    'Rules Committee',
                    entity_type='organization',
                    classification='primary',
                    primary=True,
                )
            elif not bill_data['sponsor']['budget']:
                primary_sponsor = bill_data['sponsor']['member']
                bill.add_sponsorship(
                    primary_sponsor['shortName'],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

                # There *shouldn't* be cosponsors if there is no sponsor.
                cosponsors = bill_active_version['coSponsors']['items']
                for cosponsor in cosponsors:
                    bill.add_sponsorship(
                        cosponsor['shortName'],
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )

        # List companion bill.
        same_as = bill_active_version.get('sameAs', {})
        # Check whether "sameAs" property is populated with at least one bill.
        if same_as.get('items'):
            # Get companion bill ID.
            companion_bill_id = same_as['items'][0]['basePrintNo']

            # Build companion bill session.
            start_year = same_as['items'][0]['session']
            end_year = start_year + 1
            companion_bill_session = '-'.join([str(start_year), str(end_year)])

            # Attach companion bill data.
            bill.add_related_bill(
                companion_bill_id,
                companion_bill_session,
                relation_type='companion',
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            action_datetime = datetime.datetime.strptime(action['date'], '%Y-%m-%d')
            action_date = action_datetime.date()
            types, _ = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(
                action['text'],
                action_date.strftime('%Y-%m-%d'),
                chamber=chamber,
                classification=types,
            )

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_url = self.api_client.root + self.api_client.resources['bill'].format(
            session_year=session,
            bill_id=bill_id,
            summary='',
            detail='')
        bill.add_source(api_url)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                yield self._parse_senate_votes(vote_data, bill, api_url)
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        amendments = bill_data['amendments']['items']
        for key, amendment in amendments.items():
            version = amendment['printNo']

            html_version = version + ' HTML'
            html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
                '{}&term={}'.format(bill_id, self.term_start_year)
            bill.add_version_link(
                html_version,
                html_url,
                on_duplicate='ignore',
                media_type='text/html',
            )

            pdf_version = version + ' PDF'
            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
                .format(self.term_start_year, bill_id)
            bill.add_version_link(
                pdf_version,
                pdf_url,
                on_duplicate='ignore',
                media_type='application/pdf',
            )

        yield bill
Example #35
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype)
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
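        # Use the EXSLT regular-expressions extension to match version links
        # ('Bill.aspx?File=...htm') case-insensitively.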
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(link.xpath('string()').strip(),
                                  link.attrib['href'],
                                  media_type='text/html',
                                  on_duplicate='ignore')

        sponsor_links = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
            '/label[contains(text(), "Sponsors:")]' +
            '/following-sibling::div[1]/p/a')
        for link in sponsor_links:
            if link.attrib['href'].startswith(
                    'https://sdlegislature.gov/Legislators/'):
                sponsor_type = 'person'
            elif link.attrib['href'].startswith(
                    'https://sdlegislature.gov/Legislative_Session/Committees'
            ):
                sponsor_type = 'organization'
            else:
                raise ScrapeError('Found unexpected sponsor, URL: ' +
                                  link.attrib['href'])
            bill.add_sponsorship(link.text,
                                 classification='primary',
                                 primary=True,
                                 entity_type=sponsor_type)

        actor = chamber
        use_row = False
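        # Rows before the header row containing 'Date' and 'Action' are
        # skipped; the actor defaults to the bill's chamber.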

        for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
            # Some tables have null rows that are just `<tr></tr>`
            # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
            if row.text_content() == '':
                self.debug(
                    'Skipping action table row that is completely empty')
                continue

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')

            if re.match(r'Signed by (?:the\s)*Governor', action,
                        re.IGNORECASE):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate', 'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match(r'\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Example #36
0
    def scrape(self):
        session_name = self.latest_session()
        session = session_name[0:5]
        self._bill_prefix_map = {
            'HB':  {
                'type': 'bill',
                'url_segment': 'bills/house',
            },
            'HR':  {
                'type': 'resolution',
                'url_segment': 'resolutions/house/simple',
            },
            'HCR': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/house/concurrent',
            },
            'HJR': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/house/joint'
            },
            'HC': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/house/concurrent',
            },
            'HJ': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/house/joint',
            },
            'SB': {
                'type': 'bill',
                'url_segment': 'bills/senate',
            },
            'SR': {
                'type': 'resolution',
                'url_segment': 'resolutions/senate/simple',
            },
            'SCR': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/senate/concurrent',
            },
            'SJR': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/senate/joint',
            },
            'SC': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/senate/concurrent',
            },
            'SJ': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/senate/joint',
            },
        }

        api_base_url = "https://api.iga.in.gov"
        proxy = {"url": "http://in-proxy.openstates.org"}

        # PDFs are hard to find in Indiana's web interface but easy to get via
        # the API, which requires a key in the request headers. To keep these
        # documents viewable to the public and to our scrapers, Sunlight runs a
        # proxy service at the link above using our API key for PDF access.

        client = ApiClient(self)
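        # Fetch the bill index for the session and walk every page of results.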
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                disp_bill_id = bill_id[:idx]+" "+str(int(bill_id[idx:]))
                break

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning('Bill could not be accessed. Skipping.')
                continue

            title = bill_json["title"]
            if title == "NoneNone":
                title = None
            # sometimes title is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = ("lower" if bill_json["originChamber"].lower() == "house"
                                else "upper")
            bill_type = self._bill_prefix_map[bill_prefix]['type']
            bill = Bill(disp_bill_id,
                        legislative_session=session,
                        chamber=original_chamber,
                        title=title,
                        classification=bill_type)

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors
            for s in bill_json["authors"]:
                bill.add_sponsorship(classification="author",
                                     name=self._get_name(s),
                                     entity_type='person',
                                     primary=True)

            for s in bill_json["coauthors"]:
                bill.add_sponsorship(classification="coauthor",
                                     name=self._get_name(s),
                                     entity_type='person',
                                     primary=False)

            for s in bill_json["sponsors"]:
                bill.add_sponsorship(classification="sponsor",
                                     name=self._get_name(s),
                                     entity_type='person', primary=True)

            for s in bill_json["cosponsors"]:
                bill.add_sponsorship(classification="cosponsor",
                                     name=self._get_name(s),
                                     entity_type='person',
                                     primary=False)

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = api_base_url + action_link

            try:
                actions = client.get("bill_actions", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = a["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace('T', ' ')
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if ("second reading" in d or "reread second time" in d):
                    action_type.append("reading-2")
                    reading = True

                if ("third reading" in d or "reread third time" in d):
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                if ("referred" in d and "committee on" in d
                        or "reassigned" in d and "committee on" in d):
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    if "fail" or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if len(action_type) == 0:
                    # calling it other and moving on with a warning
                    self.logger.warning("Could not recognize an action in '{}'".format(
                        action_desc))
                    action_type = None

                a = bill.add_action(chamber=action_chamber,
                                    description=action_desc,
                                    date=date,
                                    classification=action_type)
                if committee:
                    a.add_related_entity(committee, entity_type='organization')

            # subjects
            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
            for subject in subjects:
                bill.add_subject(subject)

            # versions and votes
            for version in bill_json["versions"][::-1]:
                try:
                    version_json = client.get("bill_version",
                                              session=session,
                                              bill_id=version["billName"],
                                              version_id=version["printVersionName"])
                except scrapelib.HTTPError:
                    self.logger.warning("Bill version does not seem to exist.")
                    continue

                yield from self.deal_with_version(version_json, bill, bill_id,
                                                  original_chamber, session, proxy)

            yield bill
Example #37
0
    def parse_bill_status_page(self, url, page, session, chamber):
        # see 2007 HB 2... weird.
        parsed_url = urllib.parse.urlparse(url)
        parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query))
        bill_id = "{0} {1}".format(
            parsed_query['P_BLTP_BILL_TYP_CD'],
            parsed_query['P_BILL_NO1'])

        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = page.xpath(xp).pop()
        except IndexError:
            title = page.xpath('//tr[1]/td[2]')[0].text_content()

        # Add bill type.
        _bill_id = bill_id.lower()
        if 'b' in _bill_id:
            classification = 'bill'
        elif 'j' in _bill_id or 'jr' in _bill_id:
            classification = 'joint resolution'
        elif 'cr' in _bill_id:
            classification = 'concurrent resolution'
        elif 'r' in _bill_id:
            classification = 'resolution'
        else:
            raise ValueError('unknown bill type: ' + bill_id)

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=classification)

        self.add_actions(bill, page)
        votes = self.add_votes(bill, page, url)

        tabledata = self._get_tabledata(page)

        # Add sponsor info.
        bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary',
                             entity_type='person', primary=True)

        # Various extra fields MT provides.
        plus_fields = [
            'requester',
            ('chapter number:', 'chapter'),
            'transmittal date:',
            'drafter',
            'fiscal note probable:',
            'bill draft number:',
            'preintroduction required:',
            'by request of',
            'category:']

        for x in plus_fields:
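            # Entries may be (source_key, extras_key) pairs; bare strings are
            # used for both, with spaces converted to underscores.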
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = key = x
                key = key.replace(' ', '_')

            try:
                val = tabledata[_key]
            except KeyError:
                continue

            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        subjects = []
        for tr in page.xpath(xp):
            try:
                subj = tr.xpath('td')[0].text_content()
            except IndexError:
                continue
            subjects.append(subj)

        for s in subjects:
            bill.add_subject(s)

        self.add_fiscal_notes(page, bill)

        return bill, list(votes)
Example #38
0
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        hist_url = (
            f"https://www.legis.iowa.gov/legislation/billTracking/"
            f"billHistory?billName={bill_id}&ga={session_id}"
        )
        req_session = requests.Session()
        req = req_session.get(hist_url)
        if req.status_code == 500:
            self.warning("500 error on {}, skipping".format(hist_url))
            return

        page = lxml.html.fromstring(req.text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath(
            'string(//div[@id="content"]/div[@class=' '"divideVert"]/div/div[4]/div[2])'
        ).strip()

        if title == "":
            # Sometimes the title is moved, see
            # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
            title = page.xpath(
                'string(//div[@id="content"]/div[@class=' '"divideVert"]/div[4]/div[2])'
            ).strip()
            if title == "":
                self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
                return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if "HR" in bill_id or "SR" in bill_id:
            bill_type = ["resolution"]
        elif "HJR" in bill_id or "SJR" in bill_id:
            bill_type = ["joint resolution"]
        elif "HCR" in bill_id or "SCR" in bill_id:
            bill_type = ["concurrent resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = (
            "https://www.legis.iowa.gov/docs/"
            "publications/LG{}/{}/attachments/{}.html"
        )
        version_pdf_url_template = (
            "https://www.legis.iowa.gov/docs/" "publications/LG{}/{}/{}.pdf"
        )

        # get pieces of version_link
        vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
        if vpieces:
            for version in vpieces:
                version_name = version.text
                version_abbrev = version.xpath("string(@value)")

                # Get HTML document of bill version.
                version_html_url = version_html_url_template.format(
                    version_abbrev.upper(), session_id, bill_id.replace(" ", "")
                )

                bill.add_version_link(
                    note=version_name, url=version_html_url, media_type="text/html"
                )

                # Get PDF document of bill version.
                version_pdf_url = version_pdf_url_template.format(
                    version_abbrev.upper(), session_id, bill_id.replace(" ", "")
                )

                if "Marked Up" in version_name:
                    version_pdf_url = sidebar.xpath(
                        "//iframe[@id='bbContextDoc']/@src"
                    )[0]

                bill.add_version_link(
                    note=version_name, url=version_pdf_url, media_type="application/pdf"
                )

        sponsors_str = page.xpath(
            'string(//div[@id="content"]/div[@class=' '"divideVert"]/div/div[4]/div[1])'
        ).strip()

        if re.search("^By ", sponsors_str):
            sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
        # some bills list sponsors in a different format
        else:
            sponsors = re.findall(
                r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)", sponsors_str
            )

        for sponsor in sponsors:
            sponsor = sponsor.replace(" and", "").strip(" .,")

            # a few sponsors get mangled by our regex
            sponsor = {
                "Means": "Ways & Means",
                "Iowa": "Economic Growth/Rebuild Iowa",
                "Safety": "Public Safety",
                "Resources": "Human Resources",
                "Affairs": "Veterans Affairs",
                "Protection": "Environmental Protection",
                "Government": "State Government",
                "Boef": "De Boef",
            }.get(sponsor, sponsor)

            if sponsor[0].islower():
                # SSBs catch cruft in it ('charges', 'overpayments')
                # https://sunlight.atlassian.net/browse/DATA-286
                continue

            bill.add_sponsorship(
                name=sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        for tr in page.xpath(
            "//table[contains(@class, 'billActionTable')][1]/tbody/tr"
        ):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[3])").strip()
            action = re.sub(r"\s+", " ", action)

            # Capture any amendment links.
            links = [version["links"] for version in bill.versions]
            version_urls = [link["url"] for sub in links for link in sub]
            if "amendment" in action.lower():
                for anchor in tr.xpath(".//a[1]"):
                    if "-" in anchor.text:
                        # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                        amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                        amd_id = anchor.text.replace("-", "").strip()
                        amd_url = amd_pattern.format(session_id, amd_id)
                        amd_name = "Amendment {}".format(anchor.text.strip())

                        if amd_url not in version_urls:
                            bill.add_version_link(
                                note=amd_name, url=amd_url, media_type="application/pdf"
                            )
                            version_urls.append(amd_url)
                        else:
                            self.info("Already Added {}, skipping".format(amd_url))

            if "S.J." in action or "SCS" in action:
                actor = "upper"
            elif "H.J." in action or "HCS" in action:
                actor = "lower"
            else:
                actor = "legislature"

            action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

            if action.startswith("Introduced"):
                atype = ["introduction"]
                if ", referred to" in action:
                    atype.append("referral-committee")
            elif action.startswith("Read first time"):
                atype = "reading-1"
            elif action.startswith("Referred to"):
                atype = "referral-committee"
            elif action.startswith("Sent to Governor"):
                atype = "executive-receipt"
            elif action.startswith("Reported Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Vetoed by Governor"):
                atype = "executive-veto"
            elif action.startswith("Item veto"):
                atype = "executive-veto-line-item"
            elif re.match(r"Passed (House|Senate)", action):
                atype = "passage"
            elif re.match(r"Amendment (S|H)-\d+ filed", action):
                atype = ["amendment-introduction"]
                if ", adopted" in action:
                    atype.append("amendment-passage")
            elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted", action):
                atype = "amendment-passage"
            elif re.match(r"Amendment (S|N)-\d+ lost", action):
                atype = "amendment-failure"
            elif action.startswith("Resolution filed"):
                atype = "introduction"
            elif action.startswith("Resolution adopted"):
                atype = "passage"
            elif action.startswith("Committee report") and action.endswith("passage."):
                atype = "committee-passage"
            elif action.startswith("Withdrawn"):
                atype = "withdrawal"
            else:
                atype = None

            if action.strip() == "":
                continue

            if re.search(r"END OF \d+ ACTIONS", action):
                continue

            if "$history" not in action:
                bill.add_action(
                    description=action, date=date, chamber=actor, classification=atype
                )

        self.scrape_subjects(bill, bill_id, session, req_session)

        yield bill
Example #39
0
    def scrape(self, session=None, chamber=None):
        bill_type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
        }

        chamber_map = {
            'H': 'lower',
            'S': 'upper',
            'J': 'joint',
            'E': 'legislature',  # Effective date
        }

        action_code_map = {
            'HI': None,
            'SI': None,
            'HH': None,
            'SH': None,
            'HPF': ['introduction'],
            'HDSAS': None,
            'SPF': ['introduction'],
            'HSR': ['reading-2'],
            'SSR': ['reading-2'],
            'HFR': ['reading-1'],
            'SFR': ['reading-1'],
            'HRECM': ['withdrawal', 'referral-committee'],
            'SRECM': ['withdrawal', 'referral-committee'],
            'SW&C': ['withdrawal', 'referral-committee'],
            'HW&C': ['withdrawal', 'referral-committee'],
            'HRA': ['passage'],
            'SRA': ['passage'],
            'HPA': ['passage'],
            'HRECO': None,
            'SPA': ['passage'],
            'HTABL': None,  # 'House Tabled' - what is this?
            'SDHAS': None,
            'HCFR': ['committee-passage-favorable'],
            'SCFR': ['committee-passage-favorable'],
            'HRAR': ['referral-committee'],
            'SRAR': ['referral-committee'],
            'STR': ['reading-3'],
            'SAHAS': None,
            'SE': ['passage'],
            'SR': ['referral-committee'],
            'HTRL': ['reading-3', 'failure'],
            'HTR': ['reading-3'],
            'S3RLT': ['reading-3', 'failure'],
            'HASAS': None,
            'S3RPP': None,
            'STAB': None,
            'SRECO': None,
            'SAPPT': None,
            'HCA': None,
            'HNOM': None,
            'HTT': None,
            'STT': None,
            'SRECP': None,
            'SCRA': None,
            'SNOM': None,
            'S2R': ['reading-2'],
            'H2R': ['reading-2'],
            'SENG': ['passage'],
            'HENG': ['passage'],
            'HPOST': None,
            'HCAP': None,
            'SDSG': ['executive-signature'],
            'SSG': ['executive-receipt'],
            'Signed Gov': ['executive-signature'],
            'HDSG': ['executive-signature'],
            'HSG': ['executive-receipt'],
            'EFF': None,
            'HRP': None,
            'STH': None,
            'HTS': None,
        }

        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)
        sid = SESSION_SITE_IDS[session]

        legislation = backoff(
            self.lservice.GetLegislationForSession,
            sid
        )['LegislationIndex']

        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument['StatusHistory'][0]]
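            # The status history is reversed below so actions are added in
            # chronological order.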

            actions = reversed([{
                'code': x['Code'],
                'action': x['Description'],
                '_guid': x['Id'],
                'date': x['Date']
            } for x in history])

            guid = instrument['Id']

            # A little bit hacky.
            bill_prefix = instrument['DocumentType']
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = '%s %s' % (
                bill_prefix,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            if title is None:
                continue

            bill = Bill(
                bill_id, legislative_session=session, chamber=bill_chamber, title=title,
                classification=bill_type)
            bill.add_abstract(description, note='description')
            bill.extras = {'guid': guid}

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = VoteEvent(
                        start_date=vote_['Date'].strftime('%Y-%m-%d'),
                        motion_text=vote_['Caption'] or 'Vote on Bill',
                        chamber={'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
                        result='pass' if vote_['Yeas'] > vote_['Nays'] else 'fail',
                        classification='passage',
                        bill=bill,
                    )
                    vote.set_count('yes', vote_['Yeas'])
                    vote.set_count('no', vote_['Nays'])
                    vote.set_count('other', vote_['Excused'] + vote_['NotVoting'])

                    vote.add_source(self.vsource)

                    methods = {'Yea': 'yes', 'Nay': 'no'}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        vote.vote(methods.get(how, 'other'), whom['Name'])

                    yield vote

            ccommittees = defaultdict(list)
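            # Group committee names by chamber so they can be attached to
            # committee-related actions below.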
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        'House': 'lower',
                        'Senate': 'upper',
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                action_chamber = chamber_map[action['code'][0]]

                try:
                    action_types = action_code_map[action['code']]
                except KeyError:
                    error_msg = 'Code {code} for action {action} not recognized.'.format(
                        code=action['code'], action=action['action'])

                    self.logger.warning(error_msg)

                    action_types = None

                committees = []
                if action_types and any(('committee' in x for x in action_types)):
                    committees = [str(x) for x in ccommittees.get(
                        action_chamber, [])]

                act = bill.add_action(
                    action['action'], action['date'].strftime('%Y-%m-%d'),
                    classification=action_types,
                    chamber=action_chamber)
                for committee in committees:
                    act.add_related_entity(committee, 'organization')
                act.extras = {
                    'code': action['code'],
                    'guid': action['_guid'],
                }

            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            sponsors = [
                (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
            ]

            for typ, sponsor in sponsors:
                name = '{First} {Last}'.format(**dict(sponsor['Name']))
                bill.add_sponsorship(
                    name,
                    entity_type='person',
                    classification='primary' if 'Author' in typ else 'secondary',
                    primary='Author' in typ,
                )

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x] for x in [
                        'Description',
                        'Url',
                        'Id',
                        'Version'
                    ]
                ]
                # link = bill.add_version_link(
                #     name, url, media_type='application/pdf')
                # link['extras'] = {
                #     '_internal_document_id': doc_id,
                #     '_version_id': version_id
                # }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(SOURCE_URL.format(**{
                'session': session,
                'bid': guid,
            }))

            yield bill
Example #40
0
    def scrape(self, session=None):
        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

            # chambers = [chamber] if chamber else ['upper','lower']

            #for chamber in chambers:
            #    chambers = [chamber]

            # yield from self.scrape_chamber(session)

        #get member id matching for vote parsing
        member_ids = self.get_member_ids()[session]
        per_page = 10  #seems like it gives me 10 no matter what.
        start_record = 0

        headers = {"Content-Type": "application/json"}
        url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
        bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
        params = {
            "request": {
                "sEcho": 2,
                "iColumns": 4,
                "sColumns": "",
                "iDisplayStart": 0,
                "iDisplayLength": per_page,
                "mDataProp_0": "ShortTitle",
                "mDataProp_1": "Title",
                "mDataProp_2": "LegislationCategories",
                "mDataProp_3": "Modified",
                "iSortCol_0": 0,
                "sSortDir_0": "asc",
                "iSortingCols": 0,
                "bSortable_0": "true",
                "bSortable_1": "true",
                "bSortable_2": "true",
                "bSortable_3": "true"
            },
            "criteria": {
                "Keyword": "",
                "Category": "",
                "SubCategoryId": "",
                "RequestOf": "",
                "CouncilPeriod": str(session),
                "Introducer": "",
                "CoSponsor": "",
                "CommitteeReferral": "",
                "CommitteeReferralComments": "",
                "StartDate": "",
                "EndDate": "",
                "QueryLimit": 100,
                "FilterType": "",
                "Phases": "",
                "LegislationStatus": "0",
                "IncludeDocumentSearch": "false"
            }
        }
        param_json = json.dumps(params)
        response = self.post(url, headers=headers, data=param_json)
        #the response is a terrible string-of-nested-json-strings. Yuck.
        response = decode_json(response.json()["d"])
        data = response["aaData"]

        global bill_versions

        while len(data) > 0:

            for bill in data:

                # versions sometimes appear more than once, so keep track of
                # the ones already added
                bill_versions = []

                bill_id = bill["Title"]
                if bill_id.startswith("AG"):
                    #actually an agenda, skip
                    continue
                bill_params = {"legislationId": bill_id}
                bill_info = self.post(bill_url,
                                      headers=headers,
                                      data=json.dumps(bill_params))
                bill_info = decode_json(bill_info.json()["d"])["data"]
                bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

                legislation_info = bill_info["Legislation"][0]
                title = legislation_info["ShortTitle"]

                if bill_id.startswith("R") or bill_id.startswith("CER"):
                    bill_type = "resolution"
                else:
                    bill_type = "bill"

                #dc has no chambers. calling it all upper
                # bill = Bill(session,"upper", bill_id, title, type=bill_type)
                # bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type)
                bill = Bill(bill_id,
                            legislative_session=session,
                            title=title,
                            classification=bill_type)

                #sponsors and cosponsors
                if "Introducer" in legislation_info:
                    introducers = legislation_info["Introducer"]
                    intro_date = self.date_format(
                        legislation_info["IntroductionDate"])
                    # bill.add_action("upper",
                    #                "Introduced",
                    #               intro_date,
                    #                type="introduction")
                    bill.add_action("Introduced",
                                    intro_date,
                                    chamber="upper",
                                    classification="introduction")
                else:
                    #sometimes there are introducers, sometimes not.
                    # Set Introducers to empty array to avoid downstream breakage, but log bills without introducers
                    self.logger.warning("No Introducer: {0} {1}: {2}".format(
                        bill['chamber'], bill['session'], bill['bill_id']))
                    introducers = []

                try:
                    #sometimes there are cosponsors, sometimes not.
                    cosponsors = legislation_info["CoSponsor"]
                except KeyError:
                    cosponsors = []

                for i in introducers:
                    name = i["Name"]
                    #they messed up Phil Mendelson's name
                    if name == "Phil Pmendelson":
                        name = "Phil Mendelson"
                    # bill.add_sponsor(name=sponsor_name,type="primary")
                    bill.add_sponsorship(name,
                                         classification="primary",
                                         entity_type='person',
                                         primary=True)

                for s in cosponsors:
                    name = s["Name"]
                    if name == "Phil Pmendelson":
                        name = "Phil Mendelson"
                    bill.add_sponsor(name=name, type="cosponsor")

                #if it's become law, add the law number as an alternate title
                if "LawNumber" in legislation_info:
                    law_num = legislation_info["LawNumber"]
                    if law_num:
                        bill.add_title(law_num)

                #also sometimes it's got an act number
                if "ActNumber" in legislation_info:
                    act_num = legislation_info["ActNumber"]
                    if act_num:
                        bill.add_title(act_num)

                #sometimes AdditionalInformation has a previous bill name
                if "AdditionalInformation" in legislation_info:
                    add_info = legislation_info["AdditionalInformation"]
                    if "previously" in add_info.lower():
                        prev_title = add_info.lower().replace(
                            "previously", "").strip().replace(" ", "")
                        bill.add_title(prev_title.upper())
                    elif add_info:
                        bill["additional_information"] = add_info

                if "WithDrawnDate" in legislation_info:
                    withdrawn_date = self.date_format(
                        legislation_info["WithDrawnDate"])
                    withdrawn_by = legislation_info["WithdrawnBy"][0][
                        "Name"].strip()
                    if withdrawn_by == "the Mayor":

                        bill.add_action("withdrawn",
                                        withdrawn_date,
                                        chamber="executive",
                                        classification="withdrawal")

                    elif "committee" in withdrawn_by.lower():
                        bill.add_action("withdrawn",
                                        withdrawn_date,
                                        chamber="upper",
                                        classification="withdrawal",
                                        committees=withdrawn_by)
                    else:
                        bill.add_action("withdrawn",
                                        withdrawn_date,
                                        chamber="upper",
                                        classification="withdrawal",
                                        legislators=withdrawn_by)

                #deal with actions involving the mayor
                mayor = bill_info["MayorReview"]
                if mayor != []:
                    mayor = mayor[0]

                    #in dc, mayor == governor because openstates schema
                    if "TransmittedDate" in mayor:
                        transmitted_date = self.date_format(
                            mayor["TransmittedDate"])

                        bill.add_action("transmitted to mayor",
                                        transmitted_date,
                                        chamber="executive",
                                        classification="executive-receipt")

                    if 'SignedDate' in mayor:
                        signed_date = self.date_format(mayor["SignedDate"])

                        bill.add_action("signed",
                                        signed_date,
                                        chamber="executive",
                                        classification="executive-signature")

                    elif 'ReturnedDate' in mayor:  #if returned but not signed, it was vetoed
                        veto_date = self.date_format(mayor["ReturnedDate"])

                        bill.add_action("vetoed",
                                        veto_date,
                                        chamber="executive",
                                        classification="executive-veto")

                        if 'EnactedDate' in mayor:  #if it was returned and enacted but not signed, there was a veto override
                            override_date = self.date_format(
                                mayor["EnactedDate"])

                            bill.add_action(
                                "veto override",
                                override_date,
                                chamber="upper",
                                classification="veto-override-passage")

                    if 'AttachmentPath' in mayor:
                        # documents relating to the mayor's review
                        self.add_documents(mayor["AttachmentPath"], bill)

                congress = bill_info["CongressReview"]
                if len(congress) > 0:
                    congress = congress[0]
                    if "TransmittedDate" in congress:
                        transmitted_date = self.date_format(
                            congress["TransmittedDate"])

                        bill.add_action("Transmitted to Congress for review",
                                        transmitted_date,
                                        chamber="other")

                # deal with committee actions
                if "DateRead" in legislation_info:
                    date = legislation_info["DateRead"]
                elif "IntroductionDate" in legislation_info:
                    date = legislation_info["IntroductionDate"]
                else:
                    self.logger.warning(
                        "Could not find anything that looks like an action date. Skipping."
                    )
                    continue
                date = self.date_format(date)
                if "CommitteeReferral" in legislation_info:
                    committees = []
                    for committee in legislation_info["CommitteeReferral"]:
                        if committee["Name"].lower(
                        ) == "retained by the council":
                            committees = []
                            break
                        else:
                            committees.append(committee["Name"])
                    if committees != []:
                        bill.add_action("referred to committee",
                                        date,
                                        chamber="upper",
                                        committees=committees,
                                        classification="referral-committee")

                if "CommitteeReferralComments" in legislation_info:
                    committees = []
                    for committee in legislation_info[
                            "CommitteeReferralComments"]:
                        committees.append(committee["Name"])
                    bill.add_action("comments from committee",
                                    date,
                                    chamber="upper",
                                    committees=committees,
                                    classification="other")

                # deal with miscellaneous documents listed under "OtherDocuments"
                docs = bill_info["OtherDocuments"]
                for d in docs:
                    if "AttachmentPath" in d:
                        self.add_documents(d["AttachmentPath"], bill)
                    else:
                        self.logger.warning(
                            "Document path missing from 'Other Documents'")

                if "MemoLink" in legislation_info:
                    self.add_documents(legislation_info["MemoLink"], bill)

                if "AttachmentPath" in legislation_info:
                    self.add_documents(legislation_info["AttachmentPath"],
                                       bill)

                # full council votes
                votes = bill_info["VotingSummary"]
                for vote in votes:
                    self.process_vote(vote, bill, member_ids)

                # deal with committee votes
                if "CommitteeMarkup" in bill_info:
                    committee_info = bill_info["CommitteeMarkup"]
                    if len(committee_info) > 0:
                        for committee_action in committee_info:
                            self.process_committee_vote(committee_action, bill)
                            if "AttachmentPath" in committee_action:
                                self.add_documents(committee_action["AttachmentPath"],
                                                   bill)

                bill.add_source(bill_source_url)
                self.save_bill(bill)

            # get next page
            start_record += per_page
            params["request"]["iDisplayStart"] = start_record
            param_json = json.dumps(params)
            response = self.post(url, headers=headers, data=param_json)
            response = decode_json(response.json()["d"])
            data = response["aaData"]
Example #41
    def scrape(self, session=None):
        HTML_TAGS_RE = r'<.*?>'
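        # non-greedy pattern used below to strip HTML markup from 'FullStatus'
        # strings, e.g. re.sub(HTML_TAGS_RE, '', '<b>Read first time</b>') == 'Read first time'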

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
            'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
            format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError(
                    "Unknown bill type found: '{}'".
                    format(info['BillNumber'])
                )

            bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
            # put one space back in between type and number
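            # e.g. 'J.R.H. 7' -> 'JRH7' -> 'JRH 7'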
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info['Title'],
                classification=bill_type
            )
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li'
            )
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                    replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type='person',
                        primary=(sponsor_type == 'primary')
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a'
            )

            for version in versions:
                if version.xpath('text()'):
                    bill.add_version_link(
                        note=version.xpath('text()')[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        media_type='application/pdf'
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode('utf-8')
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(info['BillNumber']))
                yield bill
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'executive'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    # assert chambers_passed == set("HS")
                    action_type = 'executive-signature'
                elif "Vetoed by the Governor" in action['FullStatus']:
                    action_type = 'executive-veto'
                elif "Read first time" in action['FullStatus'] \
                        or "Read 1st time" in action['FullStatus']:
                    action_type = 'introduction'
                elif "Reported favorably" in action['FullStatus']:
                    action_type = 'committee-passage-favorable'
                elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("H")
                elif actor == 'upper' and any(x.lower().startswith(' aspassed')
                                              or x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("S")
                else:
                    action_type = None

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                    date=datetime.datetime.strftime(
                        datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    chamber=actor,
                    classification=action_type
                )

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = ('http://legislature.vermont.gov/bill/'
                                 'loadBillRollCallDetails/{0}/{1}'.format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
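                    # 'MemberName' is of the form '<name> of <district>'; keep only the name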
                    (member_name, _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote['FullStatus'] or
                        "Veto of Governor overridden" in vote['FullStatus']):
                    did_pass = True
                elif ("Failed -- " in vote['FullStatus'] or
                      'Veto of the Governor sustained' in vote['FullStatus']):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
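                # e.g. a FullStatus containing 'Yeas = 98 ... Nays = 46' yields
                # yea_count == 98 and nay_count == 46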
                yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = VoteEvent(
                    bill=bill,
                    chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                    start_date=datetime.datetime.strftime(
                        datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                    result='pass' if did_pass else 'fail',
                    classification='passage',
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count('yes', yea_count)
                vote_to_add.set_count('no', nay_count)
                vote_to_add.set_count('not voting', len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote('not voting', member)

                yield vote_to_add

            # Capture extra information - not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Example #42
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype
                    )
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
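        # the EXSLT regular-expressions namespace lets XPath's re:test() match
        # hrefs like 'Bill.aspx?File=<name>.htm' case-insensitively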
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(
                                link.xpath('string()').strip(),
                                link.attrib['href'],
                                media_type='text/html',
                                on_duplicate='ignore'
                )

        sponsor_links = page.xpath(
            "//td[contains(@id, 'tdSponsors')]/a")
        for link in sponsor_links:
            bill.add_sponsorship(
                    link.text,
                    classification='primary',
                    primary=True,
                    entity_type='person'
                )

        actor = chamber
        use_row = False
        self.debug(bill_id)
        for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"):

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')
            elif action.startswith('Signed by Governor'):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate',
                                      'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match(r'\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
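            # e.g. '02/27/2018' -> datetime.date(2018, 2, 27)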
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Example #43
    def scrape_senate_bills(self, chamber, insert, session, year):
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
                             'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count += 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                page = self.get(page_path).text
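                # normalize non-breaking spaces (\xa0) before parsing the HTML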
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                     '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=title,
                            classification=bill_type
                            )
                bill.subject = list(set(self.subject_mapping[bill_id]))

                for table in root.xpath('//div[@id="content"]/table'):
                    if 'Bill Text' in table.text_content():
                        bill_text = table.xpath("string(tr/td[2]/a/@href)")
                        text_url = "http://www.leg.state.nv.us" + bill_text
                        bill.add_version_link(note="Bill Text",
                                              url=text_url,
                                              media_type='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsorship(name=leg,
                                         classification='primary',
                                         entity_type='person',
                                         primary=True)
                for leg in secondary:
                    bill.add_sponsorship(name=leg,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                    # bill.add_document(minutes_date, minutes_url)
                    bill.add_document_link(note=minutes_date,
                                           url=minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "upper")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
Example #44
    def scrape_matter(self, matter_link, sess):
        matter_types = {
            "Additions": "other",
            "Administrative Order": "order",
            "Annual Evaluation": "other",
            "Bid Advertisement": "other",
            "Bid Awards": "other",
            "Bid Contract": "contract",
            "Bid Protest": "other",
            "Bid Rejection": "other",
            "Birthday Scroll": "commemoration",
            "Certificate of Appreciation": "commemoration",
            "Change Order": "order",
            "Citizen's Presentation": "other",
            "Commendation": "commemoration",
            "Conflict Waiver": "other",
            "Congratulatory Certificate": "commemoration",
            "Deferrals": "other",
            "Discussion Item": "other",
            "Distinguished Visitor": "other",
            "Joint Meeting/Workshop": "other",
            "Mayoral Veto": "other",
            "Miscellaneous": "other",
            "Nomination": "nomination",
            "Oath of Office": "other",
            "Omnibus Reserve": "bill",
            "Ordinance": "ordinance",
            "Plaque": "commemoration",
            "Presentation": "other",
            "Proclamation": "proclamation",
            "Professional Service Agreement": "contract",
            "Public Hearing": "other",
            "Report": "other",
            "Request for Proposals": "other",
            "Request for Qualifications": "other",
            "Request to Advertise": "other",
            "Resolution": "resolution",
            "Resolution of Sympathy": "resolution",
            "Service Awards": "commemoration",
            "Special Item": "other",
            "Special Presentation": "other",
            "Supplement": "other",
            "Swearing-In": "other",
            "Time Sensitive Items": "other",
            "Withdrawals": "other",
            "Workshop Item": "other",
            "Zoning": "other",
            "Zoning Resolution": "resolution",
        }
        matter_doc = self.lxmlize(matter_link)
        info_dict = self.matter_table_to_dict(matter_doc)
        # we're going to use the year of the intro date as the session
        # until/unless we come up with something better
        intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
        session = sess["identifier"]
        category = matter_types[info_dict["File Type"]]
        if 'File Name' in info_dict:
            title = info_dict["File Name"]
        elif "Title" in info_dict and info_dict["Title"].strip():
            title = info_dict["Title"].strip()
        else:
            self.warning("bill has no title")
            return
        if category == 'other':
            bill = Bill(identifier=info_dict["File Number"],
                legislative_session=session,
                title=title
                )
        else:
            bill = Bill(identifier=info_dict["File Number"],
                legislative_session=session,
                title=title,
                classification=category
                )
        for spons in info_dict["Sponsors"]:
            if spons == "NONE":
                continue
            try:
                name, spons_type = spons.rsplit(",", 1)
            except ValueError:
                name = spons
                spons_type = "Sponsor"
            primary = "Prime Sponsor" in spons_type
            entity = "person"
            if "committee" in name:
                # committees are organizations, not people, in the pupa schema
                entity = "organization"
            bill.add_sponsorship(name, spons_type, entity, primary)
        if "Indexes" in info_dict:
            for subj in info_dict["Indexes"]:
                if subj.strip() and subj.strip() != "NONE":
                    bill.add_subject(subj.strip())
        if "Title" in info_dict and info_dict["Title"].strip():
            note = "bill's long title'"
            if ("Note" in info_dict and info_dict["Note"].strip()):
                note = info_dict["Note"]
            bill.add_abstract(abstract=info_dict["Title"],note=note)
        self.process_action_table(matter_doc,bill)
        bill.add_source(matter_link, note='web')

        yield bill
Example #45
    def scrape_assem_bills(self, chamber, insert, session, year):

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution', 9: 'petition'}
        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \
                             'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                root.make_links_absolute("http://www.leg.state.nv.us/")

                bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                     '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=bill_type)

                bill.subject = list(set(self.subject_mapping[bill_id]))
                billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
                text_urls = billtext.xpath("./a")
                for text_url in text_urls:
                    version_name = text_url.text.strip()
                    version_url = text_url.attrib['href']
                    bill.add_version_link(note=version_name, url=version_url,
                                          media_type='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsorship(classification='primary',
                                         name=leg, entity_type='person',
                                         primary=True)
                for leg in secondary:
                    bill.add_sponsorship(classification='cosponsor',
                                         name=leg, entity_type='person',
                                         primary=False)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document_link(note=minutes_date, url=minutes_url)
                    minutes_count += 1

                self.scrape_actions(root, bill, "lower")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
Example #46
    def scrape(self):
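        # cutoff passed to self.matters(): only consider matters from roughly the last three days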
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Example #47
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)) :
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date'] :
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try :
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError :
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []) :
                lower_title = title.lower()
                if "sundry" in lower_title or "miscellaneous" in lower_title: #these are omnibus
                    bill.add_related_bill(identifier = related_bill['label'],
                                          legislative_session = bill.legislative_session,
                                          relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in an omnibus relationship with each other
                
            for i, sponsor in enumerate(leg_details.get('Sponsors', [])) :
                if i == 0 :
                    primary = True
                    sponsorship_type = "Primary"
                else :
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legislation as
                # individual role holders or as the Office of the City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 
                                            'Mendoza, Susana')) :
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm',)) :
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(('Misc. Transmittal',
                                                'No Sponsor',
                                                'Dept./Agency')) :
                    bill.add_sponsorship(sponsor_name, 
                                         sponsorship_type,
                                         entity_type,
                                         primary,
                                         entity_id = _make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details :
                for subject in leg_details[u'Topic'].split(',') :
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []) :
                if attachment['label'] :
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']) :
                action_description = action['Action']
                try :
                    action_date =  self.toTime(action['Date']).date().isoformat()
                except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description :
                    try :
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError  :
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council' :
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=ACTION_CLASSIFICATION[action_description])

                    if action_description == 'Referred' :
                        try :
                            leg_details['Current Controlling Legislative Body']['label']
                            controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                        except TypeError :
                            controlling_bodies = leg_details['Current Controlling Legislative Body']
                        if controlling_bodies :
                            for controlling_body in controlling_bodies :
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee") :
                                    act.add_related_entity(body_name,
                                                           'organization')
                                else :
                                    act.add_related_entity(body_name,
                                                           'organization',
                                                           entity_id = _make_pseudo_id(name=body_name))


                    if 'url' in action['Action\xa0Details'] :
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result : # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                               motion_text=action_description,
                                               organization={'name': responsible_org},
                                               classification=None,
                                               start_date=action_date,
                                               result=result,
                                               bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes :
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification' : leg_summary['Type']}
                            
            yield bill
        print(unreachable_urls)
Example #48
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {"Senate": "upper", "House": "lower",
                            "House of Representatives": "lower",
                            "house": "lower", "senate": "upper"}

            # so presumably not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {"approved": True,
                            "passed": True,
                            "adopted": True,
                            "true": True,
                            "false": False,
                            "failed": False,
                            True: True,
                            False: False}

            action_dict = {"ref_ctte_100": "referral-committee",
                           "intro_100": "introduction",
                           "intro_101": "introduction",
                           "pass_300": "passage",
                           "intro_110": "reading-1",
                           "refer_210": "referral-committee",
                           "crpt_301": None,
                           "crpt_317": None,
                           "concur_606": "passage",
                           "pass_301": "passage",
                           "refer_220": "referral-committee",
                           "intro_102": ["introduction", "passage"],
                           "intro_105": ["introduction", "passage"],
                           "intro_ref_ctte_100": "referral-committee",
                           "refer_209": None,
                           "intro_108": ["introduction", "passage"],
                           "intro_103": ["introduction", "passage"],
                           "msg_reso_503": "passage",
                           "intro_107": ["introduction", "passage"],
                           "imm_consid_360": "passage",
                           "refer_213": None,
                           "adopt_reso_100": "passage",
                           "adopt_reso_110": "passage",
                           "msg_507": "amendment-passage",
                           "confer_713": None,
                           "concur_603": None,
                           "confer_712": None,
                           "msg_506": "amendment-failure",
                           "receive_message_100": "passage",
                           "motion_920": None,
                           "concur_611": None,
                           "confer_735": None,
                           "third_429": None,
                           "final_501": None,
                           "concur_608": None,
                           }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

            for row in self.get_bill_rows(session):
                spacer, number_link, _ga, title, primary_sponsor, status, spacer = row.xpath('td')

                # S.R.No.1 -> SR1
                bill_id = number_link.text_content().replace('No.', '')
                bill_id = bill_id.replace('.', '').replace(' ', '')
                # put one space back in between type and number
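                # e.g. 'S.R.No.1' -> 'SR1' -> 'SR 1'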
                bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

                title = title.text_content().strip()
                title = re.sub(r'^Title', '', title)

                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                                'general_assembly_{}/{}/{}/'.format(
                                    session,
                                    'bills' if 'B' in bill_id else 'resolutions',
                                    bill_id.lower().replace(' ', '')
                                ))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url+version["pdfDownloadLink"]
                    bill.add_version_link(version_name, version_link, media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                        sponsor_name,
                                        classification='primary',
                                        entity_type='person',
                                        primary=True
                        )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                         sponsor_name,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False,
                        )

                try:
                    action_doc = self.get(base_url+bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning("Unknown action {desc} with code {code}."
                                         " Add it to the action_dict"
                                         ".".format(desc=action_desc,
                                                    code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(datetime.datetime.strptime(
                                                 action["datetime"],
                                                 "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date, chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

                # votes
                vote_url = base_url+bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                if data["items"][0]["effective_date"]:
                    effective_date = datetime.datetime.strptime(data["items"][0]["effective_date"],
                                                                "%Y-%m-%d")
                    effective_date = self._tz.localize(effective_date)
                    # the OH website adds an action that isn't in the action list JSON.
                    # It looks like:
                    # Effective 7/6/18
                    effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                    effective_action = "Effective {}".format(effective_date_oh)
                    bill.add_action(effective_action,
                                    effective_date,
                                    chamber="executive",
                                    classification=["became-law"])

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url+bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError("Whoa, a disapprove! We've never"
                                             " gotten one before."
                                             " Go write some code to deal "
                                             "with it: {}".format(disapprove_url))

                yield bill
Example #49
    def scrape_chamber(self, chamber, session):
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE']
                    and bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
                bill.add_sponsorship(name=primary_sponsor,
                                     entity_type='organization' if "committee"
                                     in primary_sponsor.lower() else 'person',
                                     primary=True,
                                     classification="original sponsor")
            for sponsor in bill_data['SPONSOR_NAMES']:
                if sponsor in bill_data['ORIGINAL_SPONSOR']:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='organization'
                    if "committee" in sponsor.lower() else 'person',
                    primary=False,
                    classification='cosponsor',
                )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate' else 'lower')

                date = event['session_date']
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
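                    # e.g. (illustrative values) a status of 'Referred to' with
                    # committee_names ['Taxation', 'Federal and State Affairs']
                    # becomes 'Referred to Taxation and Federal and State Affairs'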
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning(
                        'unknown action code on %s: %s %s' %
                        (bill_id, event['action_code'], event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions']`,
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill
示例#50
0
def test_full_bill():
    create_jurisdiction()
    sp = ScrapePerson('Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee',
                             classification='committee',
                             parent_id=org._id)

    oldbill = ScrapeBill('HB 99',
                         '1899',
                         'Axe & Tack Tax Act',
                         classification='tax bill',
                         from_organization=org._id)

    bill = ScrapeBill('HB 1',
                      '1900',
                      'Axe & Tack Tax Act',
                      classification='tax bill',
                      from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee',
                          '1900-04-04',
                          chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith',
                         classification='extra sponsor',
                         entity_type='person',
                         primary=False,
                         entity_id=sp._id)
    bill.add_sponsorship('Jane Smith',
                         classification='lead sponsor',
                         entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official",
                      date='1969-10-20')
    bill.add_document_link('Fiscal Note',
                           'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note',
                           'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note',
                          'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp.as_dict()])

    BillImporter('jid', oi,
                 pi).import_data([oldbill.as_dict(),
                                  bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    assert b.abstracts.get().date == '1969-10-20'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
示例#51
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Create the Bill and add the information obtained from the provided bill_detail_url.
        and then yield the bill object.
        :param bill_detail_url:
        :param session:
        :param chamber:
        :param bill_id:
        :return:
        """
        page = self.get(bill_detail_url).text

        if 'INVALID BILL NUMBER' in page:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        bill_type = bill_div.xpath('span/text()')[0]

        if 'General Bill' in bill_type:
            bill_type = 'bill'
        elif 'Concurrent Resolution' in bill_type:
            bill_type = 'concurrent resolution'
        elif 'Joint Resolution' in bill_type:
            bill_type = 'joint resolution'
        elif 'Resolution' in bill_type:
            bill_type = 'resolution'
        else:
            raise ValueError('unknown bill type: %s' % bill_type)

        # this is fragile, but less fragile than it was
        b = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = b.getnext().tail.strip()

        bill = Bill(
            bill_id,
            # session name from metadata's `legislative_sessions`
            legislative_session=session,
            chamber=chamber,  # 'upper' or 'lower'
            title=bill_summary,
            classification=bill_type)

        subjects = list(self._subjects[bill_id])

        for subject in subjects:
            bill.add_subject(subject)

        # sponsors
        for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')
        for sponsor in doc.xpath(
                '//a[contains(@href, "committee.php")]/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ').strip()
            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 primary=True,
                                 entity_type='organization')

        # find versions
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_html = self.get(version_url).text
        version_doc = lxml.html.fromstring(version_html)
        version_doc.make_links_absolute(version_url)
        for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance

            bill.add_version_link(
                # Description of the version from the state;
                # e.g., 'As introduced', 'Amended', etc.
                note=version.text,
                url=version.get('href'),
                on_duplicate='ignore',
                media_type='text/html'  # Still a MIME type
            )

        # actions
        for row in bill_div.xpath('table/tr'):
            date_td, chamber_td, action_td = row.xpath('td')

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = {
                'Senate': 'upper',
                'House': 'lower',
                None: 'legislature'
            }[chamber_td.text]

            action = action_td.text_content()
            action = action.split('(House Journal')[0]
            action = action.split('(Senate Journal')[0].strip()

            atype = action_type(action)

            bill.add_action(
                description=action,  # Action description, from the state
                date=date.strftime('%Y-%m-%d'),  # `YYYY-MM-DD` format
                chamber=action_chamber,  # 'upper' or 'lower'
                classification=atype  # Options explained in the next section
            )

        # votes
        vurl = doc.xpath('//a[text()="View Vote History"]/@href')
        if vurl:
            vurl = vurl[0]
            yield from self.scrape_vote_history(bill, vurl)

        bill.add_source(bill_detail_url)
        yield bill
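The `action_type` helper called above is not included in this example. A minimal keyword-based sketch (the keywords and OCD classifications below are assumptions, not this scraper's actual mapping):

def action_type(action):
    # Hypothetical sketch of a keyword-based action classifier; the real
    # helper used above is not shown in this example.
    action = action.lower()
    if action.startswith('introduced'):
        return ['introduction']
    if 'referred to committee' in action:
        return ['referral-committee']
    if 'signed by governor' in action:
        return ['executive-signature']
    return None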
示例#52
0
    def scrape_chamber(self, chamber, session):
        chamber_name = "Senate" if chamber == "upper" else "House"
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + "bill_status/").text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:

            bill_id = bill_data["BILLNO"]

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if "CR" in bill_id:
                btype = "concurrent resolution"
            elif "R" in bill_id:
                btype = "resolution"
            elif "B" in bill_id:
                btype = "bill"

            title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

            # main
            bill = Bill(bill_id,
                        session,
                        title,
                        chamber=chamber,
                        classification=btype)
            bill.extras = {"status": bill_data["STATUS"]}

            bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

            if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
                bill.add_title(bill_data["LONGTITLE"])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
                bill.add_sponsorship(
                    name=primary_sponsor,
                    entity_type="organization"
                    if "committee" in primary_sponsor.lower() else "person",
                    primary=True,
                    classification="original sponsor",
                )
            for sponsor in bill_data["SPONSOR_NAMES"]:
                if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type="organization"
                    if "committee" in sponsor.lower() else "person",
                    primary=False,
                    classification="cosponsor",
                )

            # history is backwards
            for event in reversed(bill_data["HISTORY"]):
                actor = "upper" if event["chamber"] == "Senate" else "lower"

                date = event["session_date"]
                # append committee names if present
                if "committee_names" in event:
                    action = (event["status"] + " " +
                              " and ".join(event["committee_names"]))
                else:
                    action = event["status"]

                if event["action_code"] not in ksapi.action_codes:
                    self.warning(
                        "unknown action code on %s: %s %s" %
                        (bill_id, event["action_code"], event["status"]))
                    atype = None
                else:
                    atype = ksapi.action_codes[event["action_code"]]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions']`,
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill
示例#53
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                        '{}?calendarDate='.format(
                            session, bill_num)
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

        bill = Bill(
            identifier=bill_json['bill'],
            legislative_session=session,
            title=bill_json['catchTitle'],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
            session, bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            actor = None
            if (action_json['location'] and
                    action_json['location'] in chamber_map):
                actor = chamber_map[action_json['location']]

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']
            }

        if bill_json['introduced']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

            bill.add_version_link(
                note="Introduced",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['enrolledAct']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

            bill.add_version_link(
                note="Enrolled",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['fiscalNote']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

            bill.add_document_link(
                note="Fiscal Note",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['digest']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

            bill.add_document_link(
                note="Bill Digest",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['vetoes']:
            for veto in bill_json['vetoes']:
                url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
                bill.add_version_link(
                    note=veto['vetoLinkText'],
                    url=url,
                    media_type="application/pdf"  # optional but useful!
                )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type="application/pdf",
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            sponsor_type = ('person' if sponsor['sponsorTitle']
                            else 'organization')
            bill.add_sponsorship(name=sponsor['name'],
                                 classification=status,
                                 entity_type=sponsor_type,
                                 primary=sponsor['primarySponsor'])

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            eff = datetime.datetime.strptime(bill_json['effectiveDate'],
                                             '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
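The `parse_local_date` helper used for action dates above is not shown. A plausible sketch, assuming `statusDate` is an ISO-style local timestamp (e.g. '2018-02-12T00:00:00') in Mountain time; this is an illustration, not the scraper's actual implementation:

import datetime

import pytz


def parse_local_date(date_str):
    # Drop any fractional seconds, parse as a naive datetime, then attach
    # the Mountain timezone and convert to UTC.
    local_tz = pytz.timezone('America/Denver')
    naive = datetime.datetime.strptime(date_str.split('.')[0],
                                       '%Y-%m-%dT%H:%M:%S')
    return local_tz.localize(naive).astimezone(pytz.utc)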
示例#54
0
    def scrape_chamber(self, chamber, session):
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                if sponsor:
                    bill.add_sponsorship(
                        name=sponsor,
                        entity_type='person',
                        primary=stype == 'primary',
                        classification=stype,
                    )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(
                    action, date.strftime('%Y-%m-%d'), chamber=actor, classification=atype)

            try:
                yield from self.scrape_html(bill, session)
            except scrapelib.HTTPError:
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill_id))

            yield bill
示例#55
0
    def bill_info(self, bill_link, session, main_url):
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[1]/div/h2').text.split()

        bill_number = long_title[0]
        title = ' '.join(long_title[2:])

        if not title:
            self.error('no title, skipping %s', bill_number)
            return

        bill_type = 'resolution' if 'LR' in bill_number else 'bill'

        bill = Bill(bill_number, session, title, classification=bill_type)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

        if not introduced_by:
            introduced_by = self.get_node(
                bill_page,
                '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
            introduced_by = introduced_by.split('Introduced By:')[1].strip()

        bill.add_sponsorship(
            name=introduced_by,
            entity_type='person',
            primary=True,
            classification='primary',
        )

        action_nodes = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]//table/tbody/tr')

        for action_node in action_nodes:
            date = self.get_node(
                action_node,
                './td[1]').text
            date = datetime.strptime(date, '%b %d, %Y')

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(
                action_node,
                './td[2]').text_content()

            if 'Governor' in action:
                actor = 'executive'
            elif 'Speaker' in action:
                actor = 'legislature'
            else:
                actor = 'legislature'

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type,
            )

        # Actions are in reverse chronological order.
        bill.actions.reverse()

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[2]/'
            'div[@class="hidden-xs"]/ul[1]/li/a')

        for version_link in version_links:
            version_name = version_link.text
            version_url = version_link.attrib['href']
            # replace Current w/ session number
            version_url = version_url.replace('Current', session)
            bill.add_version_link(version_name, version_url, media_type='application/pdf')

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

        for amendment_link in amendment_links:
            amendment_name = amendment_link.text
            amendment_url = amendment_link.attrib['href']
            bill.add_document_link(amendment_name, amendment_url)

        # Related transcripts.
        transcript_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/'
            'div[@class="hidden-xs"]/table/tr/td/a')

        for transcript_link in transcript_links:
            transcript_name = transcript_link.text
            transcript_url = transcript_link.attrib['href']
            bill.add_document_link(transcript_name, transcript_url)

        yield bill

        yield from self.scrape_votes(bill, bill_page, actor)
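The `lxmlize` helper used at the top of this method is a small convenience wrapper common to these scrapers; a minimal sketch (assuming it only fetches and parses the page, and that `lxml.html` is imported) would be:

    def lxmlize(self, url):
        # Fetch the page, parse it into an lxml tree, and make links
        # absolute so the xpath queries above return usable hrefs.
        response = self.get(url)
        page = lxml.html.fromstring(response.text)
        page.make_links_absolute(url)
        return page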
示例#56
0
    def scrape(self):
        for leg_summary in self.legislation(
                created_after=datetime.datetime(2014, 1, 1)):
            leg_type = BILL_TYPES[leg_summary['Type']]

            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name": "New York City Council"})
            bill.add_source(leg_summary['url'])

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'],
                           note='created by administrative staff')

            if 'Summary' in leg_details:
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number']:
                bill.add_identifier(leg_details['Law number'],
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor,
                                     sponsorship_type,
                                     'person',
                                     primary,
                                     entity_id=_make_pseudo_id(name=sponsor))

            for attachment in leg_details.get('Attachments', []):
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            history = list(history)

            if history:
                earliest_action = min(
                    self.toTime(action['Date']) for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else:
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history:
                action_description = action['Action']
                if not action_description:
                    continue

                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration':
                    responsible_org = 'Mayor'

                if responsible_org == 'Town Hall Meeting':
                    continue
                else:
                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=action_class)

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral':
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details[
                            'Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(
                            referred_committee,
                            'organization',
                            entity_id=_make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if votes:
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=action_class,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

            text = self.text(leg_summary['url'])

            if text:
                bill.extras = {
                    'local_classification': leg_summary['Type'],
                    'full_text': text
                }
            else:
                bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
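The `_make_pseudo_id` calls above produce pupa "pseudo ids": small JSON payloads, prefixed with `~`, that the importer resolves to real database ids by lookup at import time. Roughly (a sketch, not pupa's exact implementation):

import json


def make_pseudo_id(**kwargs):
    # e.g. make_pseudo_id(name='Adam Smith') -> '~{"name": "Adam Smith"}'
    return '~' + json.dumps(kwargs, sort_keys=True)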
示例#57
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower') or
                    (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id, source_url, media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest text (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
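                    # e.g. this turns '(1)This bill' into '(1) This bill'
                    # (illustrative input)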
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime(
                    '%m/%d/%y')
                version_name = "{} - {}".format(
                    version_date_human, version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type='application/pdf',
                    date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    def replacer(matchobj):
                        if matchobj:
                            return {'Assembly': 'lower',
                                    'Senate': 'upper'}[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = list(filter(None, committees))
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(committees) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                                           classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(
                        committee, entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)
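                # Illustrative (made-up) example: a raw motion such as
                # 'Assembly AB 123 Jones.  Third Reading' is reduced by the
                # substitutions above to just 'Third Reading'.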

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = (
                    'http://leginfo.legislature.ca.gov/faces'
                    '/billVotesClient.xhtml?bill_id={}'
                ).format(fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()