Пример #1
0
    def scrape_bill(self, row, chamber, session):
        """Build a Bill from one API row and its HTML detail page.

        This is a generator: it yields whatever ``self.scrape_votes`` yields
        for the bill, then the bill itself.  Rows whose identifier contains
        ``SA`` or ``HA`` are treated as amendments and skipped with a warning,
        yielding nothing.

        Args:
            row: mapping describing one piece of legislation.  The keys read
                here are ``LegislationNumber``, ``LongTitle``, ``Synopsis``,
                ``ShortTitle``, ``SponsorPersonId`` and ``LegislationId``.
            chamber: chamber value handed to ``Bill``.
            session: legislative session handed to ``Bill`` and to
                ``self.scrape_votes``.
        """
        bill_id = row['LegislationNumber']

        # TODO: re-evaluate if these should be separate bills
        if 'SA' in bill_id or 'HA' in bill_id:
            self.warning('skipping amendment %s', bill_id)
            return

        bill_type = self.classify_bill(bill_id)
        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=row['LongTitle'],
                    classification=bill_type)
        if row['Synopsis']:
            bill.add_abstract(row['Synopsis'], 'synopsis')
        if row['ShortTitle']:
            bill.add_title(row['ShortTitle'], 'short title')
        if row['SponsorPersonId']:
            self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        legislation_id = row['LegislationId']
        html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
            legislation_id
        )
        bill.add_source(html_url, note='text/html')

        html = self.lxmlize(html_url)

        # Extra sponsors are listed on the detail page under two labelled
        # sections.  Every link points at a legislator page; removing the URL
        # prefix leaves the person id.
        legislator_url_prefix = ('https://legis.delaware.gov/LegislatorDetail?'
                                 'personId=')
        sponsor_sections = (
            ('Additional Sponsor(s):', 'primary'),
            # This section used to be queried with the "Additional Sponsor(s):"
            # label too, so co-sponsors were never read and every additional
            # sponsor was recorded a second time as a cosponsor.
            ('Co-Sponsor(s):', 'cosponsor'),
        )
        for label, sponsorship_type in sponsor_sections:
            sponsor_urls = html.xpath(
                '//label[text()="{}"]/following-sibling::div/a/@href'.format(label))
            for sponsor_url in sponsor_urls:
                sponsor_id = sponsor_url.replace(legislator_url_prefix, '')
                self.add_sponsor_by_legislator_id(bill, sponsor_id, sponsorship_type)

        versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            # on_duplicate='error'
            bill.add_version_link('Bill Text', version_url, media_type=media_type)

        for fiscal in html.xpath('//div[contains(@class,"fiscalNote")]/a/@href'):
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, legislation_id)
        yield from self.scrape_votes(bill, legislation_id, session)

        yield bill
Пример #2
0
    def scrape_bill(self, session, chamber, bill_type, url):
        """Scrape one bill detail page and yield its votes/actions and the bill.

        This is a generator: it yields everything produced by
        ``self.parse_bill_actions_table`` and finally the ``Bill`` itself.

        Args:
            session: legislative session name; its first four characters are
                parsed as an integer year to derive the prior session.
            chamber: chamber value handed to ``Bill``.
            bill_type: classification handed to ``Bill``.
            url: bill page URL; its ``billtype`` and ``billnumber`` query
                parameters form the bill identifier.

        Raises:
            IndexError: if the versions, meta-information or action table is
                missing from the page.
        """
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
        versions_table = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        # "Report Title" is a ';'-separated subject list.  Drop every blank
        # entry (list.remove("") only dropped the first one, so a title such
        # as "A;;B;" still produced an empty subject).
        stripped_subjects = (s.strip() for s in meta["Report Title"].split(";"))
        subs = [s for s in stripped_subjects if s]

        b = Bill(
            bill_id,
            session,
            meta["Measure Title"],
            chamber=chamber,
            classification=bill_type,
        )
        if meta["Description"]:
            b.add_abstract(meta["Description"], "description")
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
        companion = meta["Companion"].strip()
        if companion:
            b.add_related_bill(
                identifier=companion.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

        # Evaluate the status query once; the last text node is checked for
        # the "carried over" marker, which links this bill to the one with
        # the same identifier in the prior session.
        status_texts = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        )
        if status_texts and "carried over" in status_texts[-1].lower():
            b.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

        for sponsor in meta["Introducer(s)"]:
            b.add_sponsorship(sponsor, "primary", "person", True)

        # Called for its effect on the bill; the return value was never used.
        self.parse_bill_versions_table(b, versions_table)
        yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                                 session, url, chamber)
        yield b
Пример #3
0
    def scrape_bill(self, session, bill_id, chamber):
        """Scrape one bill page and yield its votes followed by the bill.

        This is a generator.  When the page cannot be fetched, or it lacks the
        expected "followable" header, a warning is logged and the generator
        ends without yielding anything.

        Args:
            session: legislative session; its digits (via
                ``self.replace_non_digits``) are used in the bill URL.
            bill_id: bill identifier as it appears in the URL.  It is
                normalised afterwards by dropping every character that is not
                ``S``, ``H``, ``D`` or a digit.
            chamber: chamber value handed to ``Bill``.
        """
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

        try:
            response = requests.get(bill_url)
        except requests.exceptions.RequestException:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        html = response.text

        page = lxml.html.fromstring(html)

        if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

        # Inside a character class "|" is a literal, not alternation, so the
        # previous pattern [^S|H|D|\d] let pipe characters through.
        bill_id = re.sub(r'[^SHD\d]', '', bill_id)

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=bill_title, classification='bill')

        pinslip = page.xpath('//p[@id="pinslip"]/text()')
        if pinslip and pinslip[0]:
            bill.add_abstract(pinslip[0], 'summary')

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Find the non-blank text of the dd following Sponsor or Presenter,
        # including any child link text.
        sponsor = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
        if sponsor:
            sponsor = sponsor[0].strip()
            bill.add_sponsorship(sponsor, classification='primary', primary=True,
                                 entity_type='person')

        self.scrape_cosponsors(bill, bill_url)

        version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                             "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

        # yield back votes and bill
        yield from self.scrape_actions(bill, bill_url, session)
        yield bill
Пример #4
0
    def scrape_bills(self, session):
        """Yield one Bill per measure returned by the API for *session*.

        For each measure the bill receives its summary abstract, sponsors,
        an overview-page source URL, version links and actions.

        Args:
            session: session name; used as a key into ``SESSION_KEYS`` and
                stored on each bill as its legislative session.

        Raises:
            KeyError: if a sponsor's ``SponsorLevel`` is neither ``Chief`` nor
                ``Regular``, or if a chamber/bill-type code is unknown.
        """
        session_key = SESSION_KEYS[session]
        measures_response = self.api_client.get('measures', page=500, session=session_key)

        legislators = index_legislators(self, session_key)

        # Built once instead of once per sponsor.
        sponsor_classifications = {'Chief': 'primary', 'Regular': 'cosponsor'}

        for measure in measures_response:
            bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

            # First letter of the prefix selects the chamber; the rest of the
            # prefix selects the bill type.
            chamber = self.chamber_code[bid[0]]
            bill = Bill(
                bid.replace(' ', ''),
                legislative_session=session,
                chamber=chamber,
                title=measure['RelatingTo'],
                classification=self.bill_types[measure['MeasurePrefix'][1:]]
            )
            # NOTE(review): assumes MeasureSummary is always a string -- confirm
            # the API never returns null here.
            bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

            for sponsor in measure['MeasureSponsors']:
                legislator_code = sponsor['LegislatoreCode']  # typo in API
                if not legislator_code:
                    continue
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    # Logger.warn is a deprecated alias of Logger.warning.
                    logger.warning('Legislator {} not found in session {}'.format(
                        legislator_code, session))
                    # Fall back to the raw code as the sponsor name.
                    legislator = legislator_code
                bill.add_sponsorship(
                    name=legislator,
                    classification=sponsor_classifications[sponsor['SponsorLevel']],
                    entity_type='person',
                    primary=sponsor['SponsorLevel'] == 'Chief'
                )

            bill.add_source(
                "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                    session=session_key, bid=bid.replace(' ', ''))
            )
            for document in measure['MeasureDocuments']:
                # TODO: probably mixing documents & versions here - should revisit
                try:
                    bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                          media_type='application/pdf')
                except ValueError:
                    logger.warning('Duplicate link found for {}'.format(document['DocumentUrl']))
            for action in measure['MeasureHistoryActions']:
                classifiers = self.determine_action_classifiers(action['ActionText'])
                # Action dates arrive naive; attach the scraper's timezone.
                when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                bill.add_action(action['ActionText'], when,
                                chamber=self.chamber_code[action['Chamber']],
                                classification=classifiers)

            yield bill
Пример #5
0
    def scrape_bills(self, session):
        """Generate a Bill for each measure the API lists for *session*.

        Every bill is given a summary abstract, its sponsors, a source URL for
        the measure overview page, version links and its action history.

        Args:
            session: session name; looked up in ``SESSION_KEYS`` and recorded
                as each bill's legislative session.

        Raises:
            KeyError: for an unknown ``SponsorLevel`` (only ``Chief`` and
                ``Regular`` are mapped) or an unknown chamber/bill-type code.
        """
        session_key = SESSION_KEYS[session]
        measures_response = self.api_client.get('measures', page=500, session=session_key)

        legislators = index_legislators(self, session_key)

        # Hoisted out of the loops: sponsor level -> sponsorship classification.
        level_to_classification = {'Chief': 'primary', 'Regular': 'cosponsor'}

        for measure in measures_response:
            prefix = measure['MeasurePrefix']
            bid = '{} {}'.format(prefix, measure['MeasureNumber'])
            compact_bid = bid.replace(' ', '')

            # Prefix's first letter is the chamber code, the remainder the type.
            bill = Bill(
                compact_bid,
                legislative_session=session,
                chamber=self.chamber_code[bid[0]],
                title=measure['RelatingTo'],
                classification=self.bill_types[prefix[1:]]
            )
            # NOTE(review): .strip() presumes MeasureSummary is never null --
            # verify against the API.
            bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

            for sponsor in measure['MeasureSponsors']:
                legislator_code = sponsor['LegislatoreCode']  # typo in API
                if legislator_code:
                    try:
                        legislator = legislators[legislator_code]
                    except KeyError:
                        # Use Logger.warning; Logger.warn is deprecated.
                        logger.warning('Legislator {} not found in session {}'.format(
                            legislator_code, session))
                        # Unknown legislator: keep the code as the name.
                        legislator = legislator_code
                    level = sponsor['SponsorLevel']
                    bill.add_sponsorship(
                        name=legislator,
                        classification=level_to_classification[level],
                        entity_type='person',
                        primary=(level == 'Chief')
                    )

            bill.add_source(
                "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                    session=session_key, bid=compact_bid)
            )
            for document in measure['MeasureDocuments']:
                # TODO: probably mixing documents & versions here - should revisit
                try:
                    bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                          media_type='application/pdf')
                except ValueError:
                    logger.warning('Duplicate link found for {}'.format(document['DocumentUrl']))
            for action in measure['MeasureHistoryActions']:
                classifiers = self.determine_action_classifiers(action['ActionText'])
                # Parse the naive timestamp, then localize it with self.tz.
                when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                bill.add_action(action['ActionText'], when,
                                chamber=self.chamber_code[action['Chamber']],
                                classification=classifiers)

            yield bill
Пример #6
0
    def scrape_bill(self, bill_url, bill_id, session_id):
        """Scrape one bill detail page and return the populated Bill.

        The title comes from the first ``<em>`` on the page; sponsor, summary,
        actions and co-sponsors are read from the ``data vertical_table``
        table using the XPath expressions built by ``self.bill_table_query``.

        Args:
            bill_url: URL of the bill detail page.
            bill_id: identifier stored on the bill.
            session_id: legislative session stored on the bill.

        Returns:
            The ``Bill`` built from the page.

        Raises:
            IndexError: if the title, the data table or the sponsor is missing.
        """
        page = self.lxmlize(bill_url)

        # create bill
        title = page.xpath("//em/text()")[0]
        bill = Bill(identifier=bill_id,
                    legislative_session=session_id,
                    title=title)
        bill.add_source(bill_url, note="detail")

        # add additional fields

        data_table = page.xpath("//table[@class='data vertical_table']")[0]

        # sponsor
        sponsor_name = data_table.xpath(self.bill_table_query("Sponsor"))[0]
        bill.add_sponsorship(name=sponsor_name,
                             classification="Primary",
                             entity_type="person",
                             primary=True)

        # abstract
        try:
            summary = data_table.xpath(self.bill_table_query("Summary"))[0]
        except IndexError:
            print("No summary for bill {} in session {}".format(
                bill_id, session_id))
        else:
            # Surrounding whitespace from the page markup is not part of the
            # summary, so trim it before storing.
            bill.add_abstract(abstract=summary.strip(), note="summary")

        # actions
        action_lines = data_table.xpath(self.bill_table_query("Actions"))
        for line in action_lines:
            try:
                for date_str, action_type in self.parse_actions(line):
                    bill.add_action(date=date_str,
                                    description=action_type,
                                    classification=action_type)
                    print("added action: {}".format(action_type))
            except ValueError:
                # A line that cannot be parsed is reported and skipped.
                print("failed to parse these actions: {}".format([line]))

        # co-sponsors: ignore blank text nodes, strip the rest
        co_sponsors = data_table.xpath(self.bill_table_query("Co-Sponsors"))
        co_sponsors = [name.strip() for name in co_sponsors if name.strip()]
        for name in co_sponsors:
            bill.add_sponsorship(name=name,
                                 classification="co-sponsor",
                                 entity_type="person",
                                 primary=False)

        return bill
Пример #7
0
    def scrape_bill(self, session, session_slug, chamber, url):
        """Scrape a bill's overview page and yield the resulting Bill.

        The bill number is read from the page at *url*; everything else comes
        from a second request to the "FillSelectedBillTab" endpoint.

        Args:
            session: legislative session stored on the bill.
            session_slug: path segment used in the tab-data URL and passed to
                ``self.add_versions``.
            chamber: chamber stored on the bill and passed to
                ``self.add_actions``.
            url: overview URL; must contain ``/Bill/<digits>/Overview``.
        """
        overview = lxml.html.fromstring(self.get(url).text)
        header_text = overview.xpath('//*[@id="item-header"]/text()')
        bill_no = header_text[0].strip()
        # state bill id
        internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

        # bill data gets filled in from another call; the trailing "_"
        # parameter carries the current time in milliseconds
        bill_data_url = (
            "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
            "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}"
        ).format(session_slug, internal_id, time.time() * 1000)

        tab_html = self.get(bill_data_url).text
        bill_page = lxml.html.fromstring(tab_html)

        # The "Summary:" field serves as the bill title, with non-breaking
        # spaces turned into ordinary ones.
        summary_field = self.get_header_field(bill_page, "Summary:")
        short_title = summary_field.text.replace("\u00a0", " ")

        bill = Bill(
            identifier=bill_no,
            legislative_session=session,
            title=short_title,
            chamber=chamber,
        )

        # The "Title:" field, when it has text, becomes the abstract.
        long_title = self.get_header_field(bill_page, "Title:").text
        if long_title is not None:
            bill.add_abstract(long_title, "Summary")

        sponsor_fields = (
            ("Primary Sponsor", "primary"),
            ("Co-Sponsor", "cosponsor"),
        )
        for field_name, sponsorship_type in sponsor_fields:
            field_div = self.get_header_field(bill_page, field_name)
            if field_div is not None:
                self.add_sponsors(field_div, bill, sponsorship_type)

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        # De-duplicate the subjects recorded for this bill number.
        bill.subject = list(set(self.subject_mapping[bill_no]))

        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr
        bill.extras["NV_ID"] = internal_id

        bill.add_source(url)
        yield bill
Пример #8
0
    def scrape_bill(self, session, session_slug, chamber, url):
        """Yield the Bill described by the overview page at *url*.

        Only the bill number is taken from *url* itself; title, abstract,
        sponsors and actions are read from a follow-up request to the
        "FillSelectedBillTab" endpoint.

        Args:
            session: legislative session stored on the bill.
            session_slug: path segment for the tab-data URL; also passed to
                ``self.add_versions``.
            chamber: chamber stored on the bill and given to
                ``self.add_actions``.
            url: overview URL containing ``/Bill/<digits>/Overview``.
        """
        landing_page = lxml.html.fromstring(self.get(url).text)
        bill_no = landing_page.xpath('//*[@id="item-header"]/text()')[0].strip()
        # state bill id
        id_match = re.search(r'\/Bill\/(\d+)\/Overview', url)
        internal_id = id_match.group(1)

        # bill data gets filled in from another call
        cache_buster = time.time() * 1000
        bill_data_url = (
            'https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/'
            'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}'
        ).format(session_slug, internal_id, cache_buster)

        bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

        # "Summary:" text is the title; swap non-breaking spaces for spaces.
        raw_summary = self.get_header_field(bill_page, 'Summary:').text
        short_title = raw_summary.replace(u'\u00a0', ' ')

        bill = Bill(identifier=bill_no, legislative_session=session,
                    title=short_title, chamber=chamber)

        # "Title:" text, if any, is stored as the abstract.
        title_field = self.get_header_field(bill_page, 'Title:')
        long_title = title_field.text
        if long_title is not None:
            bill.add_abstract(long_title, 'Summary')

        # Either sponsor field may be absent from the page.
        primary_div = self.get_header_field(bill_page, 'Primary Sponsor')
        if primary_div is not None:
            self.add_sponsors(primary_div, bill, 'primary')
        co_div = self.get_header_field(bill_page, 'Co-Sponsor')
        if co_div is not None:
            self.add_sponsors(co_div, bill, 'cosponsor')

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        unique_subjects = set(self.subject_mapping[bill_no])
        bill.subject = list(unique_subjects)

        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras['BDR'] = bdr
        bill.extras['NV_ID'] = internal_id

        bill.add_source(url)
        yield bill
Пример #9
0
    def scrape_bill(self, session, chamber, bill_type, url):
        """Scrape one bill detail page; yield its actions/votes, then the bill.

        This is a generator: everything produced by
        ``self.parse_bill_actions_table`` is yielded first, the ``Bill`` last.

        Args:
            session: legislative session name; its first four characters are
                parsed as a year to name the prior session.
            chamber: chamber value handed to ``Bill``.
            bill_type: classification handed to ``Bill``.
            url: bill page URL whose ``billtype`` and ``billnumber`` query
                parameters make up the bill identifier.

        Raises:
            IndexError: if the versions, meta-information or action table is
                missing from the page.
        """
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = '{}{}'.format(qs['billtype'], qs['billnumber'])
        versions_table = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        # 'Report Title' is a ';'-separated subject list; keep only non-blank
        # entries (list.remove("") dropped just the first blank one).
        subs = [s.strip() for s in meta['Report Title'].split(";") if s.strip()]

        b = Bill(bill_id, session, meta['Measure Title'],
                 chamber=chamber,
                 classification=bill_type)
        if meta['Description']:
            b.add_abstract(meta['Description'], 'description')
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
        companion = meta['Companion'].strip()
        if companion:
            b.add_related_bill(identifier=companion.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")

        # The status table may have no matching text nodes; indexing [-1]
        # unconditionally raised IndexError for such pages.
        status_texts = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")
        if status_texts and 'carried over' in status_texts[-1].lower():
            b.add_related_bill(identifier=bill_id.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")

        for sponsor in meta['Introducer(s)']:
            b.add_sponsorship(sponsor, 'primary', 'person', True)

        # Called for its effect on the bill; the return value was never used.
        self.parse_bill_versions_table(b, versions_table)
        yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
        yield b
Пример #10
0
    def scrape_bill(self, session, chamber, bill_url):
        """Scrape one bill page; yield the Bill, then its votes.

        A 503 response is logged through ``self.error`` and the bill is
        skipped (nothing is yielded); any other HTTP error propagates.

        Args:
            session: legislative session stored on the bill.
            chamber: chamber stored on the bill.
            bill_url: path appended to ``CO_URL_BASE`` to form the page URL.
        """
        full_url = '{}{}'.format(CO_URL_BASE, bill_url)

        try:
            page = self.lxmlize(full_url)
        except scrapelib.HTTPError as e:
            if e.response.status_code != 503:
                raise
            self.error('Skipping %s w/ 503', bill_url)
            return

        number_nodes = page.xpath(
            '//div[contains(@class,"field-name-field-bill-number")]'
            '//div[contains(@class,"field-item even")][1]/text()')
        bill_number = number_nodes[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        # string(...) yields '' when the summary div is absent.
        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])').strip()

        bill = Bill(bill_number, legislative_session=session,
                    chamber=chamber, title=bill_title)
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')
        bill.add_source(full_url)

        # Each helper reads its part of the page and attaches it to the bill.
        page_sections = (
            self.scrape_sponsors,
            self.scrape_actions,
            self.scrape_versions,
            self.scrape_research_notes,
            self.scrape_fiscal_notes,
            self.scrape_committee_report,
            self.scrape_amendments,
        )
        for scrape_section in page_sections:
            scrape_section(bill, page)

        yield bill
        yield from self.scrape_votes(bill, page)
        yield from self.scrape_votes(bill, page)
Пример #11
0
    def scrape_bill(self, session, chamber, bill_url):
        """Yield the Bill found at *bill_url*, followed by its votes.

        If the page request fails with HTTP 503 the failure is reported via
        ``self.error`` and nothing is yielded; other HTTP errors are re-raised.

        Args:
            session: legislative session stored on the bill.
            chamber: chamber stored on the bill.
            bill_url: path that is appended to ``CO_URL_BASE``.
        """
        source_url = '{}{}'.format(CO_URL_BASE, bill_url)

        try:
            page = self.lxmlize(source_url)
        except scrapelib.HTTPError as e:
            if e.response.status_code == 503:
                self.error('Skipping %s w/ 503', bill_url)
                return
            raise

        number_xpath = ('//div[contains(@class,"field-name-field-bill-number")]'
                        '//div[contains(@class,"field-item even")][1]/text()')
        bill_number = page.xpath(number_xpath)[0].strip()

        title_values = page.xpath('//span[@property="dc:title"]/@content')
        bill_title = title_values[0]

        summary_xpath = 'string(//div[contains(@class,"field-name-field-bill-summary")])'
        bill_summary = page.xpath(summary_xpath).strip()

        bill = Bill(
            bill_number,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
        )
        # Only attach an abstract when the page actually had summary text.
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')
        bill.add_source(source_url)

        # Fill in the remaining parts of the bill from the same page.
        self.scrape_sponsors(bill, page)
        self.scrape_actions(bill, page)
        self.scrape_versions(bill, page)
        self.scrape_research_notes(bill, page)
        self.scrape_fiscal_notes(bill, page)
        self.scrape_committee_report(bill, page)
        self.scrape_amendments(bill, page)

        yield bill
        yield from self.scrape_votes(bill, page)
Пример #12
0
    def scrape_bill_2012(self, chamber, session, bill_id, url):
        """Scrape one bill page, yield its votes and save the bill.

        This is a generator: it yields whatever ``self.parse_bill_votes``
        yields.  The bill itself is handed to ``self.save_bill`` once the
        generator has been consumed.
        NOTE(review): the bill is saved rather than yielded -- confirm this is
        what the caller expects.

        Args:
            chamber: chamber stored on the bill.
            session: legislative session stored on the bill.
            bill_id: bill identifier; an id containing ``B`` is classified as
                a bill, one containing ``J`` as a joint resolution.
            url: bill page URL; also used to make page links absolute.

        Raises:
            IndexError: if the title or summary cannot be found on the page.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        # find <a name="Title">, get parent dt, get parent dl, then dd n dl
        title = doc.xpath(
            '//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

        summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            # Without this branch _type stayed unbound and Bill(...) below
            # failed with UnboundLocalError; fall back to a plain bill.
            _type = ['bill']

        bill = Bill(
            bill_id,
            legislative_session=session,
            classification=_type,
            chamber=chamber,
            title=title,
        )
        bill.add_abstract(summary, note='summary')
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)  # sponsors
        self.parse_bill_actions(doc, bill)  # actions
        self.parse_bill_documents(doc, bill)  # documents and versions
        yield from self.parse_bill_votes(doc, bill)  # votes

        # subjects: keep the link text up to any "-see also-" marker
        bill.subject = [
            subj.text.split('-see also-')[0]
            for subj in doc.xpath('//a[contains(@href, "/subjects/")]')
        ]

        # add bill to collection
        self.save_bill(bill)
Пример #13
0
    def scrape_bill_2012(self, chamber, session, bill_id, url):
        """Scrape a bill page, yielding its votes and saving the bill.

        This is a generator whose items come from ``self.parse_bill_votes``;
        after they are exhausted the bill is passed to ``self.save_bill``.
        NOTE(review): the bill is saved, not yielded -- confirm against the
        caller.

        Args:
            chamber: chamber stored on the bill.
            session: legislative session stored on the bill.
            bill_id: bill identifier; ``B`` in the id means bill, ``J`` means
                joint resolution.
            url: bill page URL, also the base for making links absolute.

        Raises:
            IndexError: if the page has no title or summary text.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        # find <a name="Title">, get parent dt, get parent dl, then dd n dl
        title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

        summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

        # Default to a plain bill so an id with neither marker no longer
        # leaves _type unbound (which raised UnboundLocalError at Bill(...)).
        _type = ['bill']
        if 'B' not in bill_id and 'J' in bill_id:
            _type = ['joint resolution']

        bill = Bill(
            bill_id,
            legislative_session=session,
            classification=_type,
            chamber=chamber,
            title=title,
        )
        bill.add_abstract(summary, note='summary')
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)     # sponsors
        self.parse_bill_actions(doc, bill)      # actions
        self.parse_bill_documents(doc, bill)    # documents and versions
        yield from self.parse_bill_votes(doc, bill)        # votes

        # subjects: link text with anything after "-see also-" removed
        subject_links = doc.xpath('//a[contains(@href, "/subjects/")]')
        bill.subject = [link.text.split('-see also-')[0] for link in subject_links]

        # add bill to collection
        self.save_bill(bill)
Пример #14
0
	def scrape_bill(self, bill_url, bill_id, session_id):
		"""Scrape one bill detail page and return the populated Bill.

		The title is the first ``<h1>`` text; the abstract is the first
		paragraph text among the siblings preceding the "Legislative History"
		heading; sponsor, actions, co-sponsors and committees come from the
		table whose class contains ``data``, located with
		``self.bill_table_query``.

		Args:
			bill_url: URL of the bill detail page.
			bill_id: identifier stored on the bill.
			session_id: legislative session stored on the bill.

		Returns:
			The ``Bill`` built from the page.

		Raises:
			IndexError: if the title, the data table or the sponsor is missing.
		"""
		page = self.lxmlize(bill_url)
		# create bill
		title = page.xpath("//h1/text()")[0]
		bill = Bill(identifier=bill_id,
			legislative_session=session_id,
			title=title)
		bill.add_source(bill_url, note="detail")

		# add additional fields

		# abstract
		try:
			# abstract is directly above <h2>Legislative History</h2>
			leg_his = page.xpath("//h2[text()='Legislative History']")[0]
			abstract = leg_his.xpath("preceding-sibling::p/text()")[0]
			bill.add_abstract(abstract=abstract.strip(), note="summary")
		except IndexError:
			print("No abstract for bill {} in session {}".format(bill_id, session_id))

		# the rest of the fields are found inside this <table>
		data_table = page.xpath("//table[contains(@class, 'data')]")[0]

		# sponsor
		sponsor_name = data_table.xpath(self.bill_table_query("Sponsor") + "/text()")[0]
		bill.add_sponsorship(name=sponsor_name,
			classification="Primary",
			entity_type="person",
			primary=True)

		# actions
		action_lines = data_table.xpath(self.bill_table_query("Actions") + "/text()")
		for line in action_lines:
			# line.join('') joined an empty sequence and so always produced '',
			# discarding the action text; ''.join(line) keeps the text and
			# turns the XPath result into a plain str.
			line = ''.join(line)
			try:
				for date_str, action_type in self.parse_actions(line):
					bill.add_action(date=date_str,
						description=action_type,
						classification=action_type)
			except ValueError:
				# A line that cannot be parsed is reported and skipped.
				print("failed to parse these actions: {}".format([line]))

		# co-sponsors: ignore blank text nodes, strip the rest
		co_sponsors = data_table.xpath(self.bill_table_query("Co-Sponsors") + "/text()")
		co_sponsors = [name.strip() for name in co_sponsors if name.strip()]
		for name in co_sponsors:
			bill.add_sponsorship(name=name,
				classification="co-sponsor",
				entity_type="person",
				primary=False)

		# committee (stored as another sponsorship in OCD)
		committees = data_table.xpath(self.bill_table_query("Committee") + "/a/text()")
		for comm in committees:
			bill.add_sponsorship(name=comm,
				classification="secondary",  # classification ?
				entity_type="organization",
				primary=False)

		return bill
Пример #15
0
    def get_bill(self, matter):
        '''Make a Bill object from the given matter.

        Returns ``None`` instead of a bill when the matter id is listed in
        ``DUPLICATED_ACTIONS``, when the intro date, name or file number is
        empty, when fetching sponsorships or text raises ``KeyError`` (the
        web URL is then appended to ``self.version_errors``), when the
        attachment with id 103315 is encountered, or when a related matter
        cannot be fetched.
        '''
        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        intro_date = matter['MatterIntroDate']
        matter_name = matter['MatterName']
        matter_file = matter['MatterFile']

        # All three are required to build a bill.
        if not (intro_date and matter_name and matter_file):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        # NOTE(review): this calls self.sessions(...) while the related-bill
        # lookup below calls self.session(...) -- confirm both are intended.
        bill_session = self.sessions(self.toTime(intro_date))

        bill = Bill(identifier=matter_file,
                    title=matter_name,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        for source_url, source_note in ((legistar_web, 'web'), (legistar_api, 'api')):
            bill.add_source(source_url, note=source_note)

        alternate_title = matter['MatterTitle']
        if alternate_title:
            bill.add_title(alternate_title)

        abstract = matter['MatterEXText5']
        if abstract:
            bill.add_abstract(abstract, note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_document_link(attachment_name,
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_matter = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None

            related_session = self.session(
                self.toTime(related_matter['MatterIntroDate']))
            bill.add_related_bill(identifier=related_matter['MatterFile'],
                                  legislative_session=related_session,
                                  relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            # Copy whichever text renditions are present, with NUL characters
            # removed.
            renditions = (('plain_text', 'MatterTextPlain'),
                          ('rtf_text', 'MatterTextRtf'))
            for extras_key, text_key in renditions:
                if text[text_key]:
                    bill.extras[extras_key] = text[text_key].replace(u'\u0000', '')

        return bill
Пример #16
0
    def _parse_senate_billpage(self, bill_url, year):
        """Parse one Senate bill page and yield the Bill built from it.

        The page itself provides the bill number, title, brief description
        and primary sponsor.  Cosponsors, actions and bill versions live on
        separate pages that are followed through links found on this page.

        Args:
            bill_url: URL of the bill page; also recorded as the bill source.
            year: value stored as the bill's ``legislative_session``.

        Yields:
            The populated Bill.
        """
        page = self.lxmlize(bill_url)

        def first_text(expression):
            # Text content of the first node matching the XPath expression.
            return page.xpath(expression)[0].text_content()

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = first_text('//*[@id="lblBillNum"]')
        bill_title = first_text('//*[@id="lblBillTitle"]')
        bill_desc = first_text('//*[@id="lblBriefDesc"]')

        # The first three characters of the id select the classification;
        # ids whose prefix is not in bill_types are treated as plain bills.
        prefix = bill_id[:3]
        bill_type = bill_types[prefix] if prefix in bill_types else "bill"

        # Subjects are looked up by the bill id with its spaces removed.
        compact_id = bill_id.replace(" ", "")
        subjects = []
        if compact_id in self._subjects:
            subjects = self._subjects[compact_id]
            self.info("With subjects for this bill")

        self.info(compact_id)

        bill = Bill(
            bill_id,
            title=bill_desc,
            legislative_session=year,
            classification=bill_type,
        )
        bill.subject = subjects
        bill.add_abstract(bill_desc, note="abstract")
        bill.add_source(bill_url)

        if bill_title:
            bill.add_title(bill_title)

        # Get the primary sponsor
        bill.add_sponsorship(
            first_text('//a[@id="hlSponsor"]'),
            entity_type="person",
            classification="primary",
            primary=True,
        )

        # cosponsors show up on their own page, if they exist
        cosponsor_links = page.xpath('//a[@id="hlCoSponsors"]')
        if cosponsor_links:
            cosponsor_href = cosponsor_links[0].attrib.get("href")
            if cosponsor_href:
                self._parse_senate_cosponsors(bill, cosponsor_href)

        # get the actions
        action_links = page.xpath('//a[@id="hlAllActions"]')
        if action_links:
            self._parse_senate_actions(bill, action_links[0].attrib["href"])

        # stored on a separate page
        version_links = page.xpath('//a[@id="hlFullBillText"]')
        if version_links:
            versions_href = version_links[0].attrib.get("href")
            if versions_href:
                self._parse_senate_bill_versions(bill, versions_href)

        yield bill
Пример #17
0
    def scrape_bill(self, row, session):
        """Build a Bill from one legislation row and its HTML detail page.

        ``row`` supplies the display code, titles, synopsis and the primary
        sponsor; the detail page supplies additional sponsors, co-sponsors,
        bill text links and fiscal notes.  Actions, amendments and votes are
        delegated to the other ``scrape_*`` helpers.

        Args:
            row: mapping describing one piece of legislation.
            session: legislative session identifier stored on the bill.

        Yields:
            Whatever ``self.scrape_votes`` yields, followed by the Bill.

        Raises:
            ValueError: if the display code has more than one space but is
                neither an amended (" w/ ") nor a substituted (" for ") form.
        """
        bill_id = row["LegislationDisplayCode"]
        amendment = substitute = None

        # More than one space means the display code wraps the base id with
        # an amendment and/or a substitute.
        if bill_id.count(" ") > 1:
            if " w/ " in bill_id:
                self.info("Found amended bill `{}`".format(bill_id))
                bill_id, amendment = bill_id.split(" w/ ")
            # A bill can _both_ be amended and be substituted
            if " for " in bill_id:
                self.info("Found substitute to use instead: `{}`".format(bill_id))
                substitute, bill_id = bill_id.split(" for ")
            if substitute is None and amendment is None:
                raise ValueError("unknown bill_id format: " + bill_id)

        bill_type = self.classify_bill(bill_id)

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            # Identifiers starting with "S" belong to the upper chamber.
            chamber="upper" if bill_id.startswith("S") else "lower",
            title=row["LongTitle"],
            classification=bill_type,
        )
        if row["Synopsis"]:
            bill.add_abstract(row["Synopsis"], "synopsis")
        if row["ShortTitle"]:
            bill.add_title(row["ShortTitle"], "short title")
        if row["SponsorPersonId"]:
            self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"], "primary")
        for extra_key, extra_value in (
            ("substitute", substitute),
            ("amendment", amendment),
        ):
            if extra_value:
                bill.extras[extra_key] = extra_value

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
            row["LegislationId"]
        )
        bill.add_source(html_url, note="text/html")

        html = self.lxmlize(html_url)

        # Sponsor links carry the legislator id as the personId query value;
        # stripping the URL prefix leaves just that id.
        legislator_url_prefix = "https://legis.delaware.gov/LegislatorDetail?personId="
        sponsor_sections = (
            ("Additional Sponsor(s):", "primary"),
            ("Co-Sponsor(s):", "cosponsor"),
        )
        for label, sponsor_type in sponsor_sections:
            sponsor_urls = html.xpath(
                '//label[text()="{}"]/following-sibling::div/a/@href'.format(label)
            )
            for sponsor_url in sponsor_urls:
                self.add_sponsor_by_legislator_id(
                    bill, sponsor_url.replace(legislator_url_prefix, ""), sponsor_type
                )

        for version_url in html.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href'
        ):
            bill.add_version_link(
                "Bill Text",
                version_url,
                media_type=self.mime_from_link(version_url),
            )

        for fiscal_url in html.xpath('//div[contains(@class,"fiscalNote")]/a/@href'):
            self.scrape_fiscal_note(bill, fiscal_url)

        self.scrape_actions(bill, row["LegislationId"])

        if row["HasAmendments"] is True:
            self.scrape_amendments(bill, row["LegislationId"])

        yield from self.scrape_votes(bill, row["LegislationId"], session)

        yield bill
Пример #18
0
    def scrape(self, session=None, chamber=None):
        """Scrape all legislation for a session from the legislation service.

        For every entry in the session's legislation index the full detail
        record is fetched and turned into a Bill with its abstract, actions,
        sponsors, versions and sources.  Each vote attached to the record is
        fetched separately and yielded as a VoteEvent before the bill.

        Args:
            session: session identifier; when falsy, ``self.latest_session()``
                is used.  It must be a key of ``SESSION_SITE_IDS``.
            chamber: not used by this method body.

        Yields:
            VoteEvent objects for a piece of legislation, then its Bill.
        """
        # DocumentType minus its first (chamber) letter -> bill classification.
        bill_type_map = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
        }

        # First letter of a DocumentType or of an action code -> chamber.
        chamber_map = {
            "H": "lower",
            "S": "upper",
            "J": "joint",
            "E": "legislature",  # Effective date
        }

        # Action code -> list of action classifications; None means the code
        # is known but carries no classification.
        action_code_map = {
            "HI": None,
            "SI": None,
            "HH": None,
            "SH": None,
            "HPF": ["introduction"],
            "HDSAS": None,
            "SPF": ["introduction"],
            "HSR": ["reading-2"],
            "SSR": ["reading-2"],
            "HFR": ["reading-1"],
            "SFR": ["reading-1"],
            "HRECM": ["withdrawal", "referral-committee"],
            "SRECM": ["withdrawal", "referral-committee"],
            "SW&C": ["withdrawal", "referral-committee"],
            "HW&C": ["withdrawal", "referral-committee"],
            "HRA": ["passage"],
            "SRA": ["passage"],
            "HPA": ["passage"],
            "HRECO": None,
            "SPA": ["passage"],
            "HTABL": None,  # 'House Tabled' - what is this?
            "SDHAS": None,
            "HCFR": ["committee-passage-favorable"],
            "SCFR": ["committee-passage-favorable"],
            "HRAR": ["referral-committee"],
            "SRAR": ["referral-committee"],
            "STR": ["reading-3"],
            "SAHAS": None,
            "SE": ["passage"],
            "SR": ["referral-committee"],
            "HTRL": ["reading-3", "failure"],
            "HTR": ["reading-3"],
            "S3RLT": ["reading-3", "failure"],
            "HASAS": None,
            "S3RPP": None,
            "STAB": None,
            "SRECO": None,
            "SAPPT": None,
            "HCA": None,
            "HNOM": None,
            "HTT": None,
            "STT": None,
            "SRECP": None,
            "SCRA": None,
            "SNOM": None,
            "S2R": ["reading-2"],
            "H2R": ["reading-2"],
            "SENG": ["passage"],
            "HENG": ["passage"],
            "HPOST": None,
            "HCAP": None,
            "SDSG": ["executive-signature"],
            "SSG": ["executive-receipt"],
            "Signed Gov": ["executive-signature"],
            "HDSG": ["executive-signature"],
            "HSG": ["executive-receipt"],
            "EFF": None,
            "HRP": None,
            "STH": None,
            "HTS": None,
        }

        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        # Raises KeyError for a session that has no site id configured.
        sid = SESSION_SITE_IDS[session]

        legislation = backoff(self.lservice.GetLegislationForSession,
                              sid)["LegislationIndex"]

        for leg in legislation:
            lid = leg["Id"]
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            # presumably element 0 of StatusHistory holds the list of status
            # entries -- confirm against the service schema
            history = [x for x in instrument["StatusHistory"][0]]

            # The history is walked in reverse order when actions are added.
            # reversed() returns a one-shot iterator; it is consumed exactly
            # once, by the action loop further down.
            actions = reversed([{
                "code": x["Code"],
                "action": x["Description"],
                "_guid": x["Id"],
                "date": x["Date"],
            } for x in history])

            guid = instrument["Id"]

            # A little bit hacky.
            # The first letter of DocumentType picks the chamber and the
            # remainder picks the bill type; unknown values raise KeyError.
            bill_prefix = instrument["DocumentType"]
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = "%s %s" % (bill_prefix, instrument["Number"])
            if instrument["Suffix"]:
                bill_id += instrument["Suffix"]

            title = instrument["Caption"]
            description = instrument["Summary"]

            # Legislation without a caption is skipped entirely.
            if title is None:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=title,
                classification=bill_type,
            )
            bill.add_abstract(description, note="description")
            bill.extras = {"guid": guid}

            if instrument["Votes"]:
                for vote_ in instrument["Votes"]:
                    # Each entry unpacks into a pair; only the second element
                    # is used, and its first item carries the VoteId used to
                    # fetch the full vote record.
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])

                    # A vote passes only with strictly more yeas than nays,
                    # so a tie is recorded as "fail".
                    vote = VoteEvent(
                        start_date=vote_["Date"].strftime("%Y-%m-%d"),
                        motion_text=vote_["Caption"] or "Vote on Bill",
                        chamber={
                            "House": "lower",
                            "Senate": "upper"
                        }[vote_["Branch"]],
                        result="pass"
                        if vote_["Yeas"] > vote_["Nays"] else "fail",
                        classification="passage",
                        bill=bill,
                    )
                    vote.set_count("yes", vote_["Yeas"])
                    vote.set_count("no", vote_["Nays"])
                    # Excused and not-voting members are counted as "other".
                    vote.set_count("other",
                                   vote_["Excused"] + vote_["NotVoting"])

                    vote.add_source(self.vsource)

                    # Any MemberVoted value other than these two is recorded
                    # as "other".
                    methods = {"Yea": "yes", "Nay": "no"}

                    for vdetail in vote_["Votes"][0]:
                        whom = vdetail["Member"]
                        how = vdetail["MemberVoted"]
                        # Vacant seats have no voter to record.
                        if whom["Name"] == "VACANT":
                            continue
                        # vote_name_pattern splits the member name into two
                        # groups, used here as voter name and district note.
                        # NOTE(review): a non-matching name makes search()
                        # return None and this line raise AttributeError.
                        name, district = vote_name_pattern.search(
                            whom["Name"]).groups()
                        vote.vote(methods.get(how, "other"),
                                  name,
                                  note=district)

                    yield vote

            # Committee names grouped by the chamber they belong to.
            ccommittees = defaultdict(list)
            committees = instrument["Committees"]
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        "House": "lower",
                        "Senate": "upper"
                    }[committee["Type"]]].append(committee["Name"])

            for action in actions:
                # This lookup sits outside the try below, so an action code
                # with an unmapped first letter raises KeyError.
                action_chamber = chamber_map[action["code"][0]]

                try:
                    action_types = action_code_map[action["code"]]
                except KeyError:
                    # Unknown codes are logged and left unclassified.
                    error_msg = "Code {code} for action {action} not recognized.".format(
                        code=action["code"], action=action["action"])

                    self.logger.warning(error_msg)

                    action_types = None

                # For any committee-related classification, every committee
                # recorded for the action's chamber is attached to the action.
                committees = []
                if action_types and any(
                    ("committee" in x for x in action_types)):
                    committees = [
                        str(x) for x in ccommittees.get(action_chamber, [])
                    ]

                act = bill.add_action(
                    action["action"],
                    action["date"].strftime("%Y-%m-%d"),
                    classification=action_types,
                    chamber=action_chamber,
                )
                for committee in committees:
                    act.add_related_entity(committee, "organization")
                act.extras = {"code": action["code"], "guid": action["_guid"]}

            # Sponsors are only considered when Authors is present.
            # NOTE(review): if Sponsorship is a plain list, ``+=`` extends the
            # Authors list held by ``instrument`` in place.
            sponsors = []
            if instrument["Authors"]:
                sponsors = instrument["Authors"]["Sponsorship"]
                if "Sponsors" in instrument and instrument["Sponsors"]:
                    sponsors += instrument["Sponsors"]["Sponsorship"]

            # Resolve each sponsorship's MemberId to a member record.
            sponsors = [(x["Type"], self.get_member(x["MemberId"]))
                        for x in sponsors]

            for typ, sponsor in sponsors:
                name = "{First} {Last}".format(**dict(sponsor["Name"]))
                # Sponsorship types containing "Author" are primary; all
                # others are secondary.
                bill.add_sponsorship(
                    name,
                    entity_type="person",
                    classification="primary"
                    if "Author" in typ else "secondary",
                    primary="Author" in typ,
                )

            for version in instrument["Versions"]["DocumentDescription"]:
                name, url, doc_id, version_id = [
                    version[x]
                    for x in ["Description", "Url", "Id", "Version"]
                ]
                # The media type is hard-coded; every version is recorded as
                # a PDF.
                link = bill.add_version_link(name,
                                             url,
                                             media_type="application/pdf")
                link["extras"] = {
                    "_internal_document_id": doc_id,
                    "_version_id": version_id,
                }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(
                SOURCE_URL.format(**{
                    "session": session,
                    "bid": guid
                }))

            yield bill
Пример #19
0
    def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
        """Scrape one bill status page, yielding the bill and then its votes.

        Args:
            chamber: chamber stored on the Bill.
            session: legislative session identifier.
            doc_type: document-type prefix; ``doc_type + DocNum`` forms the
                bill identifier and ``doc_type[1:]`` is the ``DOC_TYPES`` key
                used when ``bill_type`` is not given.
            url: bill page URL; must contain ``DocNum=<digits>``.
            bill_type: optional classification that overrides ``DOC_TYPES``.

        Yields:
            The Bill, followed by whatever ``self.scrape_votes`` yields.
            Nothing is yielded when the page answers with a 500 error.

        Raises:
            ValueError: if an action row names an actor other than "House"
                or "Senate".
        """
        try:
            html = self.get(url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
        except scrapelib.HTTPError as e:
            # Only 500 responses are tolerated; anything else aborts here.
            # NOTE(review): this check is an assert, so it disappears under
            # ``python -O`` and every HTTP error would then be skipped.
            assert (
                "500" in e.args[0]
            ), "Unexpected error when accessing page: {}".format(e)
            self.warning("500 error for bill page; skipping bill")
            return

        # bill id, title, summary
        bill_num = re.findall(r"DocNum=(\d+)", url)[0]
        bill_type = bill_type or DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath(
            '//span[text()="Short Description:"]/following-sibling::span[1]/' "text()"
        )[0].strip()
        summary = doc.xpath(
            '//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/'
            "text()"
        )[0].strip()

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            title=title,
            classification=bill_type,
            chamber=chamber,
        )

        bill.add_abstract(summary, note="")

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # NOTE: attributing actions to committees is currently disabled, so
        # this mapping stays empty when it is handed to scrape_votes below.
        committee_actors = {}

        # Actor label as shown in the actions table -> chamber classification.
        actor_chambers = {"House": "lower", "Senate": "upper"}

        # actions
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date_td, actor_td, action_elem in group(action_tds, 3):
            date = datetime.datetime.strptime(
                date_td.text_content().strip(), "%m/%d/%Y"
            )
            date = self.localize(date).date()
            actor = actor_td.text_content()
            # Resolve the actor on every row.  An unrecognised label used to
            # leave actor_id unbound on the first row, or silently reuse the
            # previous row's actor on later rows.
            if actor not in actor_chambers:
                raise ValueError(
                    "unknown action actor {!r} for {}".format(actor, bill_id)
                )
            actor_id = {"classification": actor_chambers[actor]}

            action = action_elem.text_content()
            classification, related_orgs = _categorize_action(action)

            bill.add_action(
                action,
                date,
                organization=actor_id,
                classification=classification,
                related_entities=related_orgs,
            )

            if "sponsor" in action.lower():
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        # The loop variable is deliberately not called ``chamber`` so the
        # method's own parameter is not overwritten.
        for spontype, sponsor, sponsor_chamber, official_type in sponsor_list:
            primary = official_type == "primary"
            if sponsor_chamber:
                bill.add_sponsorship(
                    sponsor,
                    spontype,
                    "person",
                    primary=primary,
                    chamber=sponsor_chamber,
                )
            else:
                # Same argument order as above: name first, then the
                # sponsorship type (the two used to be swapped here).
                bill.add_sponsorship(sponsor, spontype, "person", primary=primary)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)
        yield bill

        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        yield from self.scrape_votes(session, bill, votes_url, committee_actors)
Пример #20
0
    def scrape_bill(self, row, chamber, session):
        """Build a Bill from one legislation row and its HTML detail page.

        Substituted bills and amendments are skipped with a warning.  A
        display code of the form ``<substitute> for <bill>`` is recorded
        under the base bill id with the substitute kept in ``bill.extras``.

        Args:
            row: mapping describing one piece of legislation.
            chamber: chamber stored on the Bill.
            session: legislative session identifier stored on the bill.

        Yields:
            Whatever ``self.scrape_votes`` yields, followed by the Bill.
            Nothing is yielded for skipped rows.

        Raises:
            ValueError: if the display code has more than one space but is
                neither an amendment nor a " for " substitute.
        """
        bill_id = row['LegislationDisplayCode']
        status = row['StatusName']

        # hack for empty StatusName
        statusless_bills = ['HA 2 to SS 1 for SB 5', 'HA 3 to SS 1 for SB 5']
        is_force_substitute = status is None and bill_id in statusless_bills

        # A missing status on any other bill simply means "not substituted";
        # testing ``'Substituted' in None`` would raise TypeError.
        is_substituted = is_force_substitute or (
            status is not None and 'Substituted' in status)

        if is_substituted:
            # skip substituted bills, the replacement is picked up instead
            self.warning('skipping %s: %s', bill_id, status)
            return

        substitute = None

        if bill_id.count(' ') > 1:
            if 'w/' in bill_id or 'SA' in bill_id or 'HA' in bill_id:
                # TODO: re-evaluate if these should be separate bills
                self.warning('skipping amendment %s', bill_id)
                return
            elif ' for ' in bill_id:
                self.info(
                    "Found substitute to use instead: `{}`".format(bill_id))
                substitute, bill_id = bill_id.split(' for ')
            else:
                raise ValueError('unknown bill_id format: ' + bill_id)

        bill_type = self.classify_bill(bill_id)

        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=row['LongTitle'],
                    classification=bill_type)
        if row['Synopsis']:
            bill.add_abstract(row['Synopsis'], 'synopsis')
        if row['ShortTitle']:
            bill.add_title(row['ShortTitle'], 'short title')
        if row['SponsorPersonId']:
            self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'],
                                              'primary')
        if substitute:
            bill.extras['substitute'] = substitute

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
            row['LegislationId'])
        bill.add_source(html_url, note='text/html')

        html = self.lxmlize(html_url)

        # Sponsor links carry the legislator id as the personId query value;
        # stripping the URL prefix leaves just that id.
        legislator_url_prefix = (
            'https://legis.delaware.gov/LegislatorDetail?personId=')
        # Each labelled section of the page maps to one sponsorship type.
        # The co-sponsor section must be located by its own label: reusing
        # the "Additional Sponsor(s):" label added those people twice and
        # never picked up the real co-sponsors.
        sponsor_sections = (
            ('Additional Sponsor(s):', 'primary'),
            ('Co-Sponsor(s):', 'cosponsor'),
        )
        for label, sponsor_type in sponsor_sections:
            sponsor_urls = html.xpath(
                '//label[text()="{}"]/following-sibling::div/a/@href'.format(
                    label))
            for sponsor_url in sponsor_urls:
                sponsor_id = sponsor_url.replace(legislator_url_prefix, '')
                self.add_sponsor_by_legislator_id(bill, sponsor_id,
                                                  sponsor_type)

        versions = html.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            version_name = 'Bill Text'
            # on_duplicate='error'
            bill.add_version_link(version_name,
                                  version_url,
                                  media_type=media_type)

        fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
        for fiscal in fiscals:
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, row['LegislationId'])
        yield from self.scrape_votes(bill, row['LegislationId'], session)

        yield bill
Пример #21
0
    def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
        """Scrape one bill status page and return the bill with its votes.

        Args:
            chamber: chamber stored on the Bill.
            session: legislative session identifier.
            doc_type: document-type prefix; ``doc_type + DocNum`` forms the
                bill identifier and ``doc_type[1:]`` is the ``DOC_TYPES`` key
                used when ``bill_type`` is not given.
            url: bill page URL; must contain ``DocNum=<digits>``.
            bill_type: optional classification that overrides ``DOC_TYPES``.

        Returns:
            A ``(bill, votes)`` tuple where ``votes`` is the result of
            ``self.scrape_votes``, or None when the page answers with a 500
            error.

        Raises:
            ValueError: if an action row names an actor other than "House"
                or "Senate", or if a committee action does not contain
                exactly one committee link.
        """
        try:
            html = self.get(url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
        except scrapelib.HTTPError as e:
            # Only 500 responses are tolerated; anything else aborts here.
            # NOTE(review): this check is an assert, so it disappears under
            # ``python -O`` and every HTTP error would then be skipped.
            assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e)
            self.warning("500 error for bill page; skipping bill")
            return

        # bill id, title, summary
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = bill_type or DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/'
                          'text()')[0].strip()
        summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/'
                            'text()')[0].strip()

        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    title=title,
                    classification=bill_type,
                    chamber=chamber)

        bill.add_abstract(summary, note='')

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # Actor label as shown in the actions table -> chamber classification.
        actor_chambers = {'House': 'lower', 'Senate': 'upper'}

        # actions
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date_td, actor_td, action_elem in group(action_tds, 3):
            date = datetime.datetime.strptime(date_td.text_content().strip(),
                                              "%m/%d/%Y")
            date = self.localize(date).date()
            actor = actor_td.text_content()
            # Resolve the actor on every row.  An unrecognised label used to
            # leave actor_id unbound on the first row, or silently reuse the
            # previous row's actor (possibly a committee) on later rows.
            if actor not in actor_chambers:
                raise ValueError(
                    'unknown action actor {!r} for {}'.format(actor, bill_id))
            actor_id = {'classification': actor_chambers[actor]}

            action = action_elem.text_content()
            classification, related_orgs = _categorize_action(action)

            if (related_orgs and any(c.startswith('committee') for c in classification)):
                # Committee actions are attributed to the committee itself,
                # identified by the single committee link in the action cell.
                # The one-element unpacking raises ValueError if there is not
                # exactly one such link.
                source, = [a.get('href') for a in
                           action_elem.xpath('a')
                           if 'committee' in a.get('href')]
                actor_id = {'sources__url': canonicalize_url(source),
                            'classification': 'committee'}

            bill.add_action(action, date,
                            organization=actor_id,
                            classification=classification,
                            related_entities=related_orgs)

            if 'sponsor' in action.lower():
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        # The loop variable is deliberately not called ``chamber`` so the
        # method's own parameter is not overwritten.
        for spontype, sponsor, sponsor_chamber, official_type in sponsor_list:
            primary = official_type == 'primary'
            if sponsor_chamber:
                bill.add_sponsorship(sponsor, spontype, 'person',
                                     primary=primary,
                                     chamber=sponsor_chamber)
            else:
                # Same argument order as above: name first, then the
                # sponsorship type (the two used to be swapped here).
                bill.add_sponsorship(sponsor, spontype, 'person',
                                     primary=primary)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        votes = self.scrape_votes(session, bill, votes_url)

        return bill, votes
Пример #22
0
    def scrape(self, session=None, chamber=None):
        bill_type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
        }

        chamber_map = {
            'H': 'lower',
            'S': 'upper',
            'J': 'joint',
            'E': 'legislature',  # Effective date
        }

        action_code_map = {
            'HI': None,
            'SI': None,
            'HH': None,
            'SH': None,
            'HPF': ['introduction'],
            'HDSAS': None,
            'SPF': ['introduction'],
            'HSR': ['reading-2'],
            'SSR': ['reading-2'],
            'HFR': ['reading-1'],
            'SFR': ['reading-1'],
            'HRECM': ['withdrawal', 'referral-committee'],
            'SRECM': ['withdrawal', 'referral-committee'],
            'SW&C': ['withdrawal', 'referral-committee'],
            'HW&C': ['withdrawal', 'referral-committee'],
            'HRA': ['passage'],
            'SRA': ['passage'],
            'HPA': ['passage'],
            'HRECO': None,
            'SPA': ['passage'],
            'HTABL': None,  # 'House Tabled' - what is this?
            'SDHAS': None,
            'HCFR': ['committee-passage-favorable'],
            'SCFR': ['committee-passage-favorable'],
            'HRAR': ['referral-committee'],
            'SRAR': ['referral-committee'],
            'STR': ['reading-3'],
            'SAHAS': None,
            'SE': ['passage'],
            'SR': ['referral-committee'],
            'HTRL': ['reading-3', 'failure'],
            'HTR': ['reading-3'],
            'S3RLT': ['reading-3', 'failure'],
            'HASAS': None,
            'S3RPP': None,
            'STAB': None,
            'SRECO': None,
            'SAPPT': None,
            'HCA': None,
            'HNOM': None,
            'HTT': None,
            'STT': None,
            'SRECP': None,
            'SCRA': None,
            'SNOM': None,
            'S2R': ['reading-2'],
            'H2R': ['reading-2'],
            'SENG': ['passage'],
            'HENG': ['passage'],
            'HPOST': None,
            'HCAP': None,
            'SDSG': ['executive-signature'],
            'SSG': ['executive-receipt'],
            'Signed Gov': ['executive-signature'],
            'HDSG': ['executive-signature'],
            'HSG': ['executive-receipt'],
            'EFF': None,
            'HRP': None,
            'STH': None,
            'HTS': None,
        }

        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)
        sid = SESSION_SITE_IDS[session]

        legislation = backoff(
            self.lservice.GetLegislationForSession,
            sid
        )['LegislationIndex']

        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument['StatusHistory'][0]]

            actions = reversed([{
                'code': x['Code'],
                'action': x['Description'],
                '_guid': x['Id'],
                'date': x['Date']
            } for x in history])

            guid = instrument['Id']

            # A little bit hacky.
            bill_prefix = instrument['DocumentType']
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = '%s %s' % (
                bill_prefix,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            if title is None:
                continue

            bill = Bill(
                bill_id, legislative_session=session, chamber=bill_chamber, title=title,
                classification=bill_type)
            bill.add_abstract(description, note='description')
            bill.extras = {'guid': guid}

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = VoteEvent(
                        start_date=vote_['Date'].strftime('%Y-%m-%d'),
                        motion_text=vote_['Caption'] or 'Vote on Bill',
                        chamber={'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
                        result='pass' if vote_['Yeas'] > vote_['Nays'] else 'fail',
                        classification='passage',
                        bill=bill,
                    )
                    vote.set_count('yes', vote_['Yeas'])
                    vote.set_count('no', vote_['Nays'])
                    vote.set_count('other', vote_['Excused'] + vote_['NotVoting'])

                    vote.add_source(self.vsource)

                    methods = {'Yea': 'yes', 'Nay': 'no'}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        vote.vote(methods.get(how, 'other'), whom['Name'])

                    yield vote

            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        'House': 'lower',
                        'Senate': 'upper',
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                action_chamber = chamber_map[action['code'][0]]

                try:
                    action_types = action_code_map[action['code']]
                except KeyError:
                    error_msg = 'Code {code} for action {action} not recognized.'.format(
                        code=action['code'], action=action['action'])

                    self.logger.warning(error_msg)

                    action_types = None

                committees = []
                if action_types and any(('committee' in x for x in action_types)):
                    committees = [str(x) for x in ccommittees.get(
                        action_chamber, [])]

                act = bill.add_action(
                    action['action'], action['date'].strftime('%Y-%m-%d'),
                    classification=action_types,
                    chamber=action_chamber)
                for committee in committees:
                    act.add_related_entity(committee, 'organization')
                act.extras = {
                    'code': action['code'],
                    'guid': action['_guid'],
                }

            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            sponsors = [
                (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
            ]

            for typ, sponsor in sponsors:
                name = '{First} {Last}'.format(**dict(sponsor['Name']))
                bill.add_sponsorship(
                    name,
                    entity_type='person',
                    classification='primary' if 'Author' in typ else 'secondary',
                    primary='Author' in typ,
                )

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x] for x in [
                        'Description',
                        'Url',
                        'Id',
                        'Version'
                    ]
                ]
                # link = bill.add_version_link(
                #     name, url, media_type='application/pdf')
                # link['extras'] = {
                #     '_internal_document_id': doc_id,
                #     '_version_id': version_id
                # }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(SOURCE_URL.format(**{
                'session': session,
                'bid': guid,
            }))

            yield bill
Пример #23
0
    def scrape_bill(self, session, bill_id, chamber):
        """Scrape one bill detail page and yield the Bill built from it.

        The generator ends without yielding anything when the page cannot be
        fetched, when it lacks the expected "followable" heading, when no
        title is present on the page, or when the alphabetic part of
        ``bill_id`` is not one of the supported prefixes.

        :param session: session identifier; run through
            ``self.replace_non_digits`` to build the URL and stored unchanged
            on the Bill.
        :param bill_id: bill identifier as it appears in the site URL.
        :param chamber: chamber value handed straight to ``Bill``.
        """
        # Example detail page: https://malegislature.gov/Bills/189/SD2739
        bill_url = "https://malegislature.gov/Bills/{}/{}".format(
            self.replace_non_digits(session), bill_id
        )

        try:
            response = self.get(bill_url)
            self.info("GET (with `requests`) - {}".format(bill_url))
        except requests.exceptions.RequestException:
            self.warning(u"Server Error on {}".format(bill_url))
            return False

        page = lxml.html.fromstring(response.text)

        # A page without this heading is treated the same as a failed request.
        heading = page.xpath('//div[contains(@class, "followable")]/h1/text()')
        if not heading:
            self.warning(u"Server Error on {}".format(bill_url))
            return False

        # The state website will periodically miss a few bills' titles for a
        # few days. They exist on the bill list page but not on the detail
        # page, and are eventually populated, so just skip for now.
        title_nodes = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')
        if not title_nodes:
            self.warning("Couldn't find title for {}; skipping".format(bill_id))
            return False
        bill_title = title_nodes[0]

        # Only these prefixes (bill id with the digits removed) are handled.
        prefix = re.sub("[0-9]", "", bill_id)
        if prefix not in ("H", "HD", "S", "SD", "SRes"):
            self.warning("Unsupported bill type for {}; skipping".format(bill_id))
            return False

        # Senate resolutions are stored under the shorter "SR" prefix.
        bill_id = bill_id.replace("SRes", "SR")

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification="bill",
        )

        pinslip = page.xpath('//p[@id="pinslip"]/text()')
        if pinslip and pinslip[0]:
            bill.add_abstract(pinslip[0], "summary")

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Take the first non-blank text node of the dd that follows the
        # Sponsor/Presenter dt, including any child link text.
        sponsor_texts = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            "following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
        )
        if sponsor_texts:
            bill.add_sponsorship(
                sponsor_texts[0].strip(),
                classification="primary",
                primary=True,
                entity_type="person",
            )

        self.scrape_cosponsors(bill, bill_url)

        pdf_hrefs = page.xpath(
            "//div[contains(@class, 'modalBtnGroup')]/"
            "a[contains(text(), 'Download PDF') and not(@disabled)]/@href"
        )
        if pdf_hrefs:
            bill.add_version_link(
                "Bill Text",
                "https://malegislature.gov{}".format(pdf_hrefs[0]),
                media_type="application/pdf",
            )

        # yield back votes and bill
        # XXX  yield from
        self.scrape_actions(bill, bill_url, session)
        yield bill
Пример #24
0
    def scrape(self, session=None):
        """Scrape every bill the API lists for a session.

        For each bill this yields whatever ``self.deal_with_version`` yields
        for each of its versions (iterated in reverse of the API order),
        followed by the ``Bill`` itself. Bills whose detail request raises
        ``scrapelib.HTTPError`` are skipped with a warning.

        :param session: session to scrape; when falsy,
            ``self.latest_session()`` is used.
        """
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        self._bill_prefix_map = {
            "HB": {
                "type": "bill",
                "url_segment": "bills/house"
            },
            "HR": {
                "type": "resolution",
                "url_segment": "resolutions/house/simple"
            },
            "HCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "HC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "SB": {
                "type": "bill",
                "url_segment": "bills/senate"
            },
            "SR": {
                "type": "resolution",
                "url_segment": "resolutions/senate/simple"
            },
            "SCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
            "SC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
        }

        api_base_url = "https://api.iga.in.gov"
        proxy = {"url": "http://in-proxy.openstates.org"}

        # ah, indiana. it's really, really hard to find
        # pdfs in their web interface. Super easy with
        # the api, but a key needs to be passed
        # in the headers. To make these documents
        # viewable to the public and our scrapers,
        # sunlight's put up a proxy service at this link
        # using our api key for pdf document access.

        # (response key, sponsorship classification, primary flag), applied
        # in this order for every bill.
        sponsorship_kinds = (
            ("authors", "author", True),
            ("coauthors", "coauthor", False),
            ("sponsors", "sponsor", True),
            ("cosponsors", "cosponsor", False),
        )

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]
            # Display id: the prefix before the first digit, a space, then
            # the numeric part with leading zeros dropped via int().
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                disp_bill_id = bill_id[:idx] + " " + str(int(bill_id[idx:]))
                break

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill",
                                       session=session,
                                       bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Bill could not be accessed. Skipping.")
                continue

            title = bill_json["description"]
            if title == "NoneNone":
                title = None
            # sometimes description is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning(
                    "Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = ("lower" if bill_json["originChamber"].lower()
                                == "house" else "upper")
            bill_type = self._bill_prefix_map[bill_prefix]["type"]
            bill = Bill(
                disp_bill_id,
                legislative_session=session,
                chamber=original_chamber,
                title=title,
                classification=bill_type,
            )

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors
            for json_key, sponsorship_class, is_primary in sponsorship_kinds:
                for s in bill_json[json_key]:
                    bill.add_sponsorship(
                        classification=sponsorship_class,
                        name=self._get_name(s),
                        entity_type="person",
                        primary=is_primary,
                    )

            # actions
            action_link = bill_json["actions"]["link"]
            # NOTE: this value is not read again before the next bill
            # overwrites it; kept so the key lookup above still happens.
            api_source = api_base_url + action_link

            try:
                actions = client.get("bill_actions",
                                     session=session,
                                     bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = a["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace("T", " ")
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if "second reading" in d or "reread second time" in d:
                    action_type.append("reading-2")
                    reading = True

                if "third reading" in d or "reread third time" in d:
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                if "committee on" in d and ("referred" in d or "reassigned" in d):
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    # Both substrings must be tested against ``d``; a bare
                    # string literal in the condition is always truthy and
                    # would tag every amendment action as a failure.
                    if "fail" in d or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if not action_type:
                    # calling it other and moving on with a warning
                    self.logger.warning(
                        "Could not recognize an action in '{}'".format(
                            action_desc))
                    action_type = None

                act = bill.add_action(
                    chamber=action_chamber,
                    description=action_desc,
                    date=date,
                    classification=action_type,
                )
                if committee:
                    act.add_related_entity(committee, entity_type="organization")

            # subjects
            for subject_entry in bill_json["latestVersion"]["subjects"]:
                bill.add_subject(subject_entry["entry"])

            # Abstract
            if bill_json["latestVersion"]["digest"]:
                bill.add_abstract(bill_json["latestVersion"]["digest"],
                                  note="Digest")

            # versions and votes
            for version in bill_json["versions"][::-1]:
                try:
                    version_json = client.get(
                        "bill_version",
                        session=session,
                        bill_id=version["billName"],
                        version_id=version["printVersionName"],
                    )
                except scrapelib.HTTPError:
                    self.logger.warning("Bill version does not seem to exist.")
                    continue

                yield from self.deal_with_version(version_json, bill, bill_id,
                                                  original_chamber, session,
                                                  proxy)

            yield bill
Пример #25
0
    def _parse_senate_billpage(self, bill_url, year):
        """Parse a Senate bill page and yield the Bill built from it.

        Reads the bill number, title and brief description from the page,
        attaches the primary sponsor, and follows the co-sponsor, actions and
        full-text links when the page provides them.

        :param bill_url: URL of the bill page; also recorded as a source.
        :param year: value stored as the Bill's ``legislative_session``.
        """
        page = self.lxmlize(bill_url)

        def label_text(element_id):
            # Text content of the first element carrying the given id.
            return page.xpath('//*[@id="{}"]'.format(element_id))[0].text_content()

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        number = label_text("lblBillNum")
        long_title = label_text("lblBillTitle")
        description = label_text("lblBriefDesc")
        # bill_lr = label_text("lblLRNum")

        # The first three characters of the number select the classification;
        # anything not listed in bill_types is a plain "bill".
        prefix = number[:3]
        classification = bill_types[prefix] if prefix in bill_types else "bill"

        compact_id = number.replace(" ", "")
        if compact_id in self._subjects:
            subjects = self._subjects[compact_id]
            self.info("With subjects for this bill")
        else:
            subjects = []

        self.info(compact_id)

        bill = Bill(
            number,
            title=description,
            chamber='upper',
            legislative_session=year,
            classification=classification,
        )
        bill.subject = subjects
        bill.add_abstract(description, note='abstract')
        bill.add_source(bill_url)

        if long_title:
            bill.add_title(long_title)

        # Get the primary sponsor
        sponsor_link = page.xpath('//a[@id="hlSponsor"]')[0]
        # bill_sponsor_link = sponsor_link.attrib.get('href')
        bill.add_sponsorship(
            sponsor_link.text_content(),
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # cosponsors show up on their own page, if they exist
        cosponsor_links = page.xpath('//a[@id="hlCoSponsors"]')
        if cosponsor_links and cosponsor_links[0].attrib.get('href'):
            self._parse_senate_cosponsors(bill, cosponsor_links[0].attrib['href'])

        # get the actions
        action_links = page.xpath('//a[@id="hlAllActions"]')
        if action_links:
            self._parse_senate_actions(bill, action_links[0].attrib['href'])

        # stored on a separate page
        version_links = page.xpath('//a[@id="hlFullBillText"]')
        if version_links and version_links[0].attrib.get('href'):
            self._parse_senate_bill_versions(bill, version_links[0].attrib['href'])

        yield bill
Пример #26
0
    def scrape_matter(self, matter_link, sess):
        """Scrape one matter page and yield the Bill built from it.

        The page's info table is read via ``self.matter_table_to_dict``. The
        matter's "File Type" selects the classification; matters mapped to
        "other" are created without an explicit classification. When neither
        a "File Name" entry nor a non-blank "Title" is present, a warning is
        logged and nothing is yielded.

        :param matter_link: URL of the matter page; also recorded as a source.
        :param sess: mapping whose ``"identifier"`` entry is used as the
            Bill's ``legislative_session``.
        :raises KeyError: if the "File Type" is not in the mapping below, or
            a required table entry is missing.
        """
        matter_types = {
            "Additions": "other",
            "Administrative Order": "order",
            "Annual Evaluation": "other",
            "Bid Advertisement": "other",
            "Bid Awards": "other",
            "Bid Contract": "contract",
            "Bid Protest": "other",
            "Bid Rejection": "other",
            "Birthday Scroll": "commemoration",
            "Certificate of Appreciation": "commemoration",
            "Change Order": "order",
            "Citizen's Presentation": "other",
            "Commendation": "commemoration",
            "Conflict Waiver": "other",
            "Congratulatory Certificate": "commemoration",
            "Deferrals": "other",
            "Discussion Item": "other",
            "Distinguished Visitor": "other",
            "Joint Meeting/Workshop": "other",
            "Mayoral Veto": "other",
            "Miscellaneous": "other",
            "Nomination": "nomination",
            "Oath of Office": "other",
            "Omnibus Reserve": "bill",
            "Ordinance": "ordinance",
            "Plaque": "commemoration",
            "Presentation": "other",
            "Proclamation": "proclamation",
            "Professional Service Agreement": "contract",
            "Public Hearing": "other",
            "Report": "other",
            "Request for Proposals": "other",
            "Request for Qualifications": "other",
            "Request to Advertise": "other",
            "Resolution": "resolution",
            "Resolution of Sympathy": "resolution",
            "Service Awards": "commemoration",
            "Special Item": "other",
            "Special Presentation": "other",
            "Supplement": "other",
            "Swearing-In": "other",
            "Time Sensitive Items": "other",
            "Withdrawals": "other",
            "Workshop Item": "other",
            "Zoning": "other",
            "Zoning Resolution": "resolution"
        }
        matter_doc = self.lxmlize(matter_link)
        info_dict = self.matter_table_to_dict(matter_doc)
        # we're going to use the year of the intro date as the session
        # until/unless we come up with something better
        # NOTE: the parsed date is not used yet; the call is kept so a
        # missing or malformed "Introduced" entry still fails loudly here.
        intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
        session = sess["identifier"]
        category = matter_types[info_dict["File Type"]]
        if 'File Name' in info_dict:
            title = info_dict["File Name"]
        elif "Title" in info_dict and info_dict["Title"].strip():
            title = info_dict["Title"].strip()
        else:
            self.warning("bill has no title")
            return
        if category == 'other':
            bill = Bill(identifier=info_dict["File Number"],
                        legislative_session=session,
                        title=title)
        else:
            bill = Bill(identifier=info_dict["File Number"],
                        legislative_session=session,
                        title=title,
                        classification=category)
        for spons in info_dict["Sponsors"]:
            if spons == "NONE":
                continue
            # Entries look like "<name>,<sponsor type>"; with no comma the
            # whole entry is the name and the type defaults to "Sponsor".
            try:
                name, spons_type = spons.rsplit(",", 1)
            except ValueError:
                name = spons
                spons_type = "Sponsor"
            primary = "Prime Sponsor" in spons_type
            # Committee sponsors are organizations, everyone else a person.
            # (This previously assigned an undefined name and raised
            # NameError for any sponsor containing "committee".)
            entity = "organization" if "committee" in name else "person"
            bill.add_sponsorship(name, spons_type, entity, primary)
        if "Indexes" in info_dict:
            for subj in info_dict["Indexes"]:
                if subj.strip() and subj.strip() != "NONE":
                    bill.add_subject(subj.strip())
        if "Title" in info_dict and info_dict["Title"].strip():
            # The long title doubles as the abstract; a non-blank "Note"
            # entry replaces the default note text.
            note = "bill's long title'"
            if ("Note" in info_dict and info_dict["Note"].strip()):
                note = info_dict["Note"]
            bill.add_abstract(abstract=info_dict["Title"], note=note)
        self.process_action_table(matter_doc, bill)
        bill.add_source(matter_link, note='web')

        yield bill
Пример #27
0
    def scrape(self):
        """Yield a Bill, and a VoteEvent per recorded vote, for each item
        returned by ``self.legislation`` created after 2014-01-01.

        The bill's session is derived from its earliest action via
        ``self.sessions``; with no history, the first entry of
        ``self.SESSION_STARTS`` is used. Vote events for a bill are yielded
        before the bill itself.
        """
        created_after = datetime.datetime(2014, 1, 1)
        for leg_summary in self.legislation(created_after=created_after):
            local_type = leg_summary['Type']
            leg_type = BILL_TYPES[local_type]

            # The session is filled in below once the history is known.
            bill = Bill(
                identifier=leg_summary['File\xa0#'],
                title=leg_summary['Title'],
                legislative_session=None,
                classification=leg_type,
                from_organization={"name": "New York City Council"},
            )
            summary_url = leg_summary['url']
            bill.add_source(summary_url)

            leg_details = self.legDetails(summary_url)
            history = self.history(summary_url)

            bill.add_title(leg_details['Name'],
                           note='created by administrative staff')

            if 'Summary' in leg_details:
                bill.add_abstract(leg_details['Summary'], note='')

            law_number = leg_details['Law number']
            if law_number:
                bill.add_identifier(law_number, note='law number')

            for sponsor, sponsorship_type, primary in self._sponsors(
                    leg_details.get('Sponsors', [])):
                bill.add_sponsorship(sponsor, sponsorship_type,
                                     'person', primary,
                                     entity_id=make_pseudo_id(name=sponsor))

            for attachment in leg_details.get('Attachments', []):
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            # Materialize the history: it is scanned once for the earliest
            # date and then walked again to record the actions.
            history = list(history)

            if not history:
                bill.legislative_session = str(self.SESSION_STARTS[0])
            else:
                earliest_action = min(self.toTime(entry['Date'])
                                      for entry in history)
                bill.legislative_session = self.sessions(earliest_action)

            for action in history:
                action_description = action['Action']
                if not action_description:
                    continue

                action_class = ACTION_CLASSIFICATION[action_description]
                action_date = self.toDate(action['Date'])

                # Normalize the acting body's name.
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration':
                    responsible_org = 'Mayor'

                # Town hall meetings are not recorded as actions at all.
                if responsible_org == 'Town Hall Meeting':
                    continue

                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

                details = action['Action\xa0Details']
                if 'url' not in details:
                    continue

                action_detail_url = details['url']
                if action_class == 'committee-referral':
                    # The committee name is whatever follows the last
                    # " to the " in the action text.
                    action_details = self.actionDetails(action_detail_url)
                    referred_committee = action_details['Action text'].rsplit(
                        ' to the ', 1)[-1]
                    act.add_related_entity(
                        referred_committee,
                        'organization',
                        entity_id=make_pseudo_id(name=referred_committee))

                result, votes = self.extractVotes(action_detail_url)
                if not votes:
                    continue

                action_vote = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action_description,
                    organization={'name': responsible_org},
                    classification=action_class,
                    start_date=action_date,
                    result=result,
                    bill=bill)
                action_vote.add_source(action_detail_url)

                for option, voter in votes:
                    action_vote.vote(option, voter)

                yield action_vote

            text = self.text(summary_url)

            extras = {'local_classification': local_type}
            if text:
                extras['full_text'] = text
            bill.extras = extras

            yield bill
Пример #28
0
    def scrape(self):
        """Yield one Bill per agenda item between ``self.start_date`` and
        ``self.end_date``.

        The item title may end in " - by <role> <mover>, seconded by <role>
        <seconder>"; when both names are present they are recorded as
        sponsorships and stripped from the title. Every version of the item
        that carries a real action (not blank, not a bare clock time)
        becomes an action on the bill.
        """
        # Compiled once for the whole run rather than once per agenda item.
        title_re = re.compile(
            "^(.+?)(?: - (?:by )?((?:Deputy )?Mayor|Councillor) (.+), seconded by ((?:Deputy )?Mayor|Councillor) (.+))?$"
        )
        # An "action" that is only a time of day, e.g. "9:30 AM". The class
        # is [AP]; writing it as [A|P] would also accept a literal "|".
        clock_time_re = re.compile(r"\d+:\d+ [AP]M")

        for agenda_item in self.agendaItems(date_from=self.start_date, date_to=self.end_date):
            # TODO: Add agenda_item type to OCD
            leg_type = "bill"

            title = agenda_item["Title"].replace("\n", " ")
            title, primary_role, primary_sponsor, secondary_role, secondary_sponsor = title_re.match(title).groups()

            b = Bill(
                identifier=agenda_item["Item No."],
                title=title,
                legislative_session=None,
                classification=leg_type,
                from_organization={"name": self.jurisdiction.name},
            )
            b.add_source(agenda_item["url"], note="web")

            if primary_sponsor and secondary_sponsor:
                b.add_sponsorship(primary_sponsor, "mover", "person", True)
                b.add_sponsorship(secondary_sponsor, "seconder", "person", False)

            # TODO: Fake session for now
            b.legislative_session = "2014-2018"

            agenda_item_versions = self.agendaItemVersions(agenda_item["url"])

            # Use one version's full_text (will be most recent)
            b.extras["full_text"] = agenda_item_versions[0]["full_text"]

            for version in agenda_item_versions:
                action_date = self.toDate(version["date"])

                if "Summary" in version["sections"]:
                    # TODO: Investigate whether these vary between versions, as
                    # we perhaps don't need to add one for each
                    b.add_abstract(version["sections"]["Summary"], note="", date=action_date)

                if not version["action"]:
                    continue
                if clock_time_re.match(version["action"]):
                    continue

                action_description = version["action"]
                responsible_org = version["responsible_org"]
                action_class = ACTION_CLASSIFICATION.get(version["action"])

                if responsible_org == "City Council":
                    responsible_org = self.jurisdiction.name
                elif action_class == "passage":
                    # A passage by any other body is a committee passage; it
                    # is favorable when some section name mentions
                    # "Recommendations".
                    action_class = "committee-passage"
                    if any("Recommendations" in s for s in version["sections"]):
                        action_class = "committee-passage-favorable"

                b.add_action(
                    action_description, action_date, organization={"name": responsible_org}, classification=action_class
                )

            yield b
Пример #29
0
    def scrape_bill(self, bill_id):
        """Convert one legacy API bill record into a ``Bill`` and its ``VoteEvent``s.

        The record is fetched through ``self.api`` and consumed key by key with
        ``pop``; the closing ``assert not old`` fails when a key was left
        unhandled, and the same check is applied to every vote record.

        Args:
            bill_id: identifier appended to the ``bills/`` API path.

        Yields:
            One ``VoteEvent`` per entry of the record's ``votes`` list, followed
            by the converted ``Bill``.  Nothing is yielded when the first
            remaining classification is 'miscellaneous', 'jres' or 'cres'.

        Raises:
            ValueError: if the record lists companions but ``self.state`` has no
                companion relation type mapped (previously this surfaced as an
                ``UnboundLocalError`` on ``rtype``).
        """
        old = self.api('bills/' + bill_id + '?')

        # Bookkeeping fields that are not carried over.  The first group must
        # be present (KeyError otherwise); the second group is optional.
        for key in ('id', 'state', 'created_at', 'updated_at', 'action_dates'):
            old.pop(key)
        for key in ('level', 'country', '+bill_type', '+subject',
                    '+scraped_subjects', 'subjects'):
            old.pop(key, None)

        classification = old.pop('type')

        # ca weirdness: these tags are stripped from the classification list
        for tag in ('fiscal committee', 'urgency', 'local program', 'tax levy'):
            if tag in classification:
                classification.remove(tag)

        if classification[0] in ['miscellaneous', 'jres', 'cres']:
            return

        # State-specific renames; they only apply when the classification is
        # exactly the single listed value.
        state_remaps = {
            ('ar', 'memorial resolution'): 'memorial',
            ('ar', 'concurrent memorial resolution'): 'concurrent memorial',
            ('il', 'joint session resolution'): 'joint resolution',
            ('ny', 'legislative resolution'): 'resolution',
            ('nh', 'address'): 'resolution',
        }
        if len(classification) == 1:
            remapped = state_remaps.get((self.state, classification[0]))
            if remapped:
                classification = [remapped]

        if not old['title'] and self.state == 'me':
            old['title'] = '(unknown)'

        chamber = old.pop('chamber')
        if self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber in ('joint', 'conference'):
            chamber = 'legislature'

        new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
                   chamber=chamber, classification=classification)

        abstract = old.pop('summary', None)
        if abstract:
            new.add_abstract(abstract, note='')

        for title in old.pop('alternate_titles'):
            new.add_title(title)

        for doc in old.pop('documents'):
            new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

        for doc in old.pop('versions'):
            new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))

        for subj in old.pop('scraped_subjects', []):
            if subj:
                new.add_subject(subj)

        for spon in old.pop('sponsors'):
            if spon.get('committee_id') is not None:
                entity_type = 'organization'
            elif spon.get('leg_id') is not None:
                entity_type = 'person'
            else:
                entity_type = ''
            new.add_sponsorship(spon['name'], spon['type'], entity_type,
                                spon['type'] == 'primary')

        executive_actors = ('governor', 'mayor', 'secretary of state')
        legislature_actors = ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                              'Office of the Legislative Fiscal Analyst', 'Became Law w',
                              'conference')
        related_entity_types = {'committee': 'organization', 'legislator': 'person'}
        is_ca = self.state == 'ca'

        for act in old.pop('actions'):
            actor = act['actor']
            actor_lc = actor.lower()
            if actor_lc in executive_actors:
                actor = 'executive'
            elif actor_lc == 'house' or (is_ca and actor_lc.startswith('lower (')):
                actor = 'lower'
            # NOTE(review): 'upper`' (with a trailing backtick) is kept verbatim;
            # it may match a junk actor value in the data or be a typo -- confirm.
            elif actor_lc in ('senate', 'upper`') or (is_ca and actor_lc.startswith('upper (')):
                actor = 'upper'
            elif actor in legislature_actors or (is_ca and actor_lc.startswith('legislature (')):
                actor = 'legislature'

            if actor in ('committee', 'sponsor') and self.state == 'pr':
                actor = 'legislature'

            # nebraska & DC
            if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
                actor = 'legislature'

            if act['action']:
                newact = new.add_action(
                    act['action'], act['date'][:10], chamber=actor,
                    classification=[action_types[c] for c in act['type'] if c != 'other'])
                # Loop variable deliberately not called ``re`` so the regex
                # module is not shadowed inside this method.
                for entity in act.get('related_entities', []):
                    mapped_type = related_entity_types.get(entity['type'], entity['type'])
                    newact.add_related_entity(entity['name'], mapped_type)

        for comp in old.pop('companions', []):
            # Only these states have a relation type mapped; fail with a clear
            # message instead of an unbound local for anything else.
            if self.state not in ('nj', 'ny', 'mn'):
                raise ValueError(
                    'no companion relation type defined for state %r (bill %s)'
                    % (self.state, bill_id))
            new.add_related_bill(comp['bill_id'], comp['session'], 'companion')

        for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
            new.add_identifier(abid)

        # generic OpenStates stuff
        for openstates_id in old.pop('all_ids'):
            new.add_identifier(openstates_id, scheme='openstates')

        for source in old.pop('sources'):
            source.pop('retrieved', None)
            new.add_source(**source)

        ext_title = old.pop('+extended_title', None)
        if ext_title:
            new.add_title(ext_title, note='Extended Title')
        official_title = old.pop('+official_title', None)
        if official_title:
            new.add_title(official_title, note='Official Title')

        # Remaining optional keys are copied to extras with the '+' stripped.
        bill_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number', '+referral',
                       '+companion', '+description', '+fiscal_note_probable:',
                       '+preintroduction_required:', '+drafter', '+category:', '+chapter',
                       '+requester', '+transmittal_date:', '+by_request_of', '+bill_draft_number:',
                       '+bill_lr', '+bill_url', '+rcs_num', '+fiscal_note', '+impact_clause',
                       '+fiscal_notes', '+short_title', '+type_', '+conference_committee',
                       'conference_committee', '+companion_bill_ids', '+additional_information']
        for k in bill_extras:
            v = old.pop(k, None)
            if v:
                new.extras[k.replace('+', '')] = v

        # votes
        required_vote_keys = ('id', 'state', 'bill_id', 'vote_id')
        ignored_vote_keys = (
            'bill_chamber', '+state', '+country', '+level', '+vacant', '+not_voting',
            '+amended', '+excused', '+NV', '+AB', '+P', '+V', '+E', '+EXC', '+EMER',
            '+present', '+absent', '+seconded', '+moved', '+vote_type', '+actual_vote',
            '+skip_votes', '+bill_chamber', '+session', '+bill_id', '+bill_session',
            'committee', 'committee_id',
        )
        vote_extras = ['+record', '+method', 'method', '+filename', 'record', '+action',
                       '+location', '+rcs_num', '+type_', '+threshold', '+other_vote_detail',
                       '+voice_vote']

        for vote_no, vote in enumerate(old.pop('votes'), start=1):
            for key in required_vote_keys:
                vote.pop(key)
            for key in ignored_vote_keys:
                vote.pop(key, None)
            vtype = vote.pop('type', 'passage')

            if vtype == 'veto_override':
                vtype = ['veto-override']
            elif vtype == 'amendment':
                vtype = ['amendment-passage']
            elif vtype == 'other':
                vtype = ''
            else:
                vtype = ['bill-passage']

            # most states need identifiers for uniqueness, just do it everywhere
            identifier = vote['date'] + '-' + str(vote_no)

            chamber = vote.pop('chamber')
            if chamber == 'upper' and self.state in ('ne', 'dc'):
                chamber = 'legislature'
            elif chamber == 'joint':
                chamber = 'legislature'

            newvote = VoteEvent(legislative_session=vote.pop('session'),
                                motion_text=vote.pop('motion'),
                                result='pass' if vote.pop('passed') else 'fail',
                                chamber=chamber,
                                start_date=vote.pop('date'),
                                classification=vtype,
                                bill=new,
                                identifier=identifier)
            for vt in ('yes', 'no', 'other'):
                newvote.set_count(vt, vote.pop(vt + '_count'))
                for name in vote.pop(vt + '_votes'):
                    newvote.vote(vt, name['name'])

            for source in vote.pop('sources'):
                source.pop('retrieved', None)
                newvote.add_source(**source)

            # Votes without their own sources fall back to the bill's sources.
            if not newvote.sources:
                newvote.sources = new.sources

            for k in vote_extras:
                v = vote.pop(k, None)
                if v:
                    newvote.extras[k.replace('+', '')] = v

            # Every key of the vote record must have been consumed by now.
            assert not vote, vote.keys()
            yield newvote

        # Every key of the bill record must have been consumed by now.
        assert not old, old.keys()

        yield new
Пример #30
0
    def _parse_senate_billpage(self, bill_url, year):
        """Scrape one senate bill page and yield the ``Bill`` built from it.

        Args:
            bill_url: URL of the bill page; also recorded as the bill's source.
            year: accepted for the caller's benefit; not read by this method.

        Yields:
            The populated ``Bill``.  Nothing is yielded when the page's bill
            number (spaces removed) is the placeholder 'XXXXXX'.
        """
        page = self.lxmlize(bill_url)

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = page.xpath('//*[@id="lblBillNum"]')[0].text_content()
        bill_title = page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()

        # The first three characters of the number select the classification.
        prefix = bill_id[:3]
        bill_type = bill_types[prefix] if prefix in bill_types else "bill"

        compact_id = bill_id.replace(" ", "")
        subjects = []
        if compact_id in self._subjects:
            subjects = self._subjects[compact_id]
            self.info("With subjects for this bill")

        self.info(compact_id)

        if compact_id == 'XXXXXX':
            self.info("Skipping Junk Bill")
            return

        bill = Bill(
            bill_id,
            title=bill_desc,
            chamber='upper',
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subjects
        bill.add_abstract(bill_desc, note='abstract')
        bill.add_source(bill_url)

        if bill_title:
            bill.add_title(bill_title)

        # Get the primary sponsor
        sponsor_link = bill_page_sponsor = page.xpath('//a[@id="hlSponsor"]')[0]
        bill.add_sponsorship(
            sponsor_link.text_content(),
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # cosponsors show up on their own page, if they exist
        cosponsor_links = page.xpath('//a[@id="hlCoSponsors"]')
        if cosponsor_links and cosponsor_links[0].attrib.get('href'):
            self._parse_senate_cosponsors(bill, cosponsor_links[0].attrib['href'])

        # get the actions
        action_links = page.xpath('//a[@id="hlAllActions"]')
        if action_links:
            self._parse_senate_actions(bill, action_links[0].attrib['href'])

        # stored on a separate page
        version_links = page.xpath('//a[@id="hlFullBillText"]')
        if version_links and version_links[0].attrib.get('href'):
            self._parse_senate_bill_versions(bill, version_links[0].attrib['href'])

        # Only amendments whose link text mentions "adopted" become versions.
        for anchor in page.xpath('//a[contains(@href,"ShowAmendment.asp")]'):
            label = anchor.xpath('string(.)').strip()
            if 'adopted' not in label.lower():
                continue
            bill.add_version_link(
                label,
                anchor.xpath('@href')[0],
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        yield bill
Пример #31
0
    def scrape_bill(self, bill_num, session):
        """Scrape one bill from the wyoleg.gov LsoService JSON endpoint.

        Args:
            bill_num: bill number placed in the BillInformation API path.
            session: session identifier; used in the API path, the source URL
                and amendment URLs, and as the bill's legislative session.

        Yields:
            Whatever ``self.scrape_vote`` yields for each entry of the
            response's ``rollCalls`` list, followed by the populated ``Bill``.
        """
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                        '{}?calendarDate='.format(
                            session, bill_num)
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        # Identifiers starting with 'H' are treated as lower-chamber bills and
        # everything else as upper.  The letter must be compared explicitly:
        # testing its truthiness alone is always true and picks 'lower'.
        chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

        bill = Bill(identifier=bill_json['bill'],
                    legislative_session=session,
                    title=bill_json['catchTitle'],
                    chamber=chamber,
                    classification="bill",
                    )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(session,
                                                                      bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            # Locations outside chamber_map (or empty ones) leave the actor unset.
            location = action_json['location']
            actor = chamber_map.get(location) if location else None

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']}

        # (response key, link note, method that attaches the link), in the
        # order the links are added.
        linked_files = (
            ('introduced', 'Introduced', bill.add_version_link),
            ('enrolledAct', 'Enrolled', bill.add_version_link),
            ('fiscalNote', 'Fiscal Note', bill.add_document_link),
            ('digest', 'Bill Digest', bill.add_document_link),
        )
        for key, note, add_link in linked_files:
            if bill_json[key]:
                add_link(note=note,
                         url='http://wyoleg.gov/{}'.format(bill_json[key]),
                         media_type="application/pdf"  # optional but useful!
                         )

        for veto in bill_json['vetoes'] or []:
            bill.add_version_link(note=veto['vetoLinkText'],
                                  url='http://wyoleg.gov/{}'.format(veto['vetoLinkPath']),
                                  media_type="application/pdf"  # optional but useful!
                                  )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type="application/pdf",
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            # Sponsors without a title are recorded as organizations.
            sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
            bill.add_sponsorship(
                name=sponsor['name'],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor['primarySponsor']
            )

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            # Re-emit the m/d/Y date from the API in ISO form.
            eff = datetime.datetime.strptime(
                bill_json['effectiveDate'], '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Пример #32
0
    def _scrape_bill(self, session, bill_data):
        """Build a ``Bill`` (plus votes) from one bill record of the API response.

        Args:
            session: legislative session assigned to the bill and used when
                formatting the API source URL.
            bill_data: dict for a single bill; this method reads its
                'summary', 'amendments', 'sponsor', 'actions' and 'votes' keys.

        Yields:
            For upper-chamber bills, the result of ``_parse_senate_votes`` for
            each vote; for lower-chamber bills, whatever
            ``AssemblyBillPage.build()`` yields; then the ``Bill`` itself.
        """
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title,
         (prefix, number, active_version)) = details

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title or bill_data['summary'],
            classification=bill_type,
        )

        if bill_data['summary']:
            bill.add_abstract(bill_data['summary'], note='')

        amendments = bill_data['amendments']['items']
        bill_active_version = amendments[active_version]

        # Parse sponsors.
        sponsor = bill_data['sponsor']
        if sponsor is not None:
            if sponsor['rules'] is True:
                bill.add_sponsorship(
                    'Rules Committee',
                    entity_type='organization',
                    classification='primary',
                    primary=True,
                )
            elif not sponsor['budget']:
                bill.add_sponsorship(
                    sponsor['member']['shortName'],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

                # There *shouldn't* be cosponsors if there is no sponsor.
                for cosponsor in bill_active_version['coSponsors']['items']:
                    bill.add_sponsorship(
                        cosponsor['shortName'],
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )

        # List companion bill.  A missing or empty "sameAs" property simply
        # means there is no companion; indexing the ``{}`` fallback directly
        # would raise KeyError in exactly that case.
        same_as_items = (bill_active_version.get('sameAs') or {}).get('items')
        if same_as_items:
            companion = same_as_items[0]

            # Build companion bill session as "<start>-<start + 1>".
            start_year = companion['session']
            companion_bill_session = '-'.join([str(start_year), str(start_year + 1)])

            # Attach companion bill data.
            bill.add_related_bill(
                companion['basePrintNo'],
                companion_bill_session,
                relation_type='companion',
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            # Round-tripping through strptime rejects malformed dates.
            action_date = datetime.datetime.strptime(
                action['date'], '%Y-%m-%d').date()
            types, _ = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(
                action['text'],
                action_date.strftime('%Y-%m-%d'),
                chamber=chamber,
                classification=types,
            )

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_url = self.api_client.root + self.api_client.resources[
            'bill'].format(
                session_year=session, bill_id=bill_id, summary='', detail='')
        bill.add_source(api_url)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                yield self._parse_senate_votes(vote_data, bill, api_url)
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            yield from assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        # NOTE(review): both URLs are built from bill_id rather than the
        # amendment's print number, so every amendment links the same pair of
        # URLs -- confirm this is intended.
        for amendment in amendments.values():
            version = amendment['printNo']

            html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
                '{}&term={}&Text=Y'.format(bill_id, self.term_start_year)
            bill.add_version_link(
                version + ' HTML',
                html_url,
                on_duplicate='ignore',
                media_type='text/html',
            )

            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
                .format(self.term_start_year, bill_id)
            bill.add_version_link(
                version + ' PDF',
                pdf_url,
                on_duplicate='ignore',
                media_type='application/pdf',
            )

        yield bill
Пример #33
0
    def _parse_senate_billpage(self, bill_url, year):
        """Yield a ``Bill`` scraped from the senate bill page at ``bill_url``.

        ``year`` is part of the signature but is not read here.  Pages whose
        bill number (with spaces stripped) equals 'XXXXXX' are logged as junk
        and produce nothing.
        """
        doc = self.lxmlize(bill_url)

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = doc.xpath('//*[@id="lblBillNum"]')[0].text_content()
        bill_title = doc.xpath('//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = doc.xpath('//*[@id="lblBriefDesc"]')[0].text_content()

        # Classification is looked up by the first three characters of the
        # bill number, defaulting to a plain bill.
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]
        else:
            bill_type = "bill"

        bid = bill_id.replace(" ", "")
        has_subjects = bid in self._subjects
        subs = self._subjects[bid] if has_subjects else []
        if has_subjects:
            self.info("With subjects for this bill")

        self.info(bid)

        if bid == 'XXXXXX':
            self.info("Skipping Junk Bill")
            return

        bill = Bill(
            bill_id,
            title=bill_desc,
            chamber='upper',
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_abstract(bill_desc, note='abstract')
        bill.add_source(bill_url)

        if bill_title:
            bill.add_title(bill_title)

        # Get the primary sponsor
        sponsor_name = doc.xpath('//a[@id="hlSponsor"]')[0].text_content()
        bill.add_sponsorship(
            sponsor_name,
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # cosponsors show up on their own page, if they exist
        cosponsor_tags = doc.xpath('//a[@id="hlCoSponsors"]')
        if cosponsor_tags and cosponsor_tags[0].attrib.get('href'):
            self._parse_senate_cosponsors(bill, cosponsor_tags[0].attrib['href'])

        # get the actions
        action_tags = doc.xpath('//a[@id="hlAllActions"]')
        if action_tags:
            self._parse_senate_actions(bill, action_tags[0].attrib['href'])

        # stored on a separate page
        version_tags = doc.xpath('//a[@id="hlFullBillText"]')
        if version_tags and version_tags[0].attrib.get('href'):
            self._parse_senate_bill_versions(bill, version_tags[0].attrib['href'])

        # Adopted amendments are attached as additional PDF versions.
        for link in doc.xpath('//a[contains(@href,"ShowAmendment.asp")]'):
            text = link.xpath('string(.)').strip()
            if 'adopted' in text.lower():
                href = link.xpath('@href')[0]
                bill.add_version_link(text, href, media_type='application/pdf',
                                      on_duplicate='ignore')

        yield bill
Пример #34
0
    def scrape_bill_type(self,
                         chamber,
                         session,
                         bill_type,
                         type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower')
                    or (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime('%m/%d/%y')
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(version_name,
                                        version_url_pdf,
                                        media_type='application/pdf',
                                        date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                'Assembly': 'lower',
                                'Senate': 'upper'
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on',
                             action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime('%Y-%m-%d'),
                    chamber=actor,
                    classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(committee,
                                              entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = ('http://leginfo.legislature.ca.gov/faces'
                              '/billVotesClient.xhtml?bill_id={}').format(
                                  fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Пример #35
0
    def _scrape_bills(self):
        """
        Does the following

        1) Scrapes bill data from unitedstates project and saves the data to path specified in UnitedStates module
        2) Iterates over bill data and converts each one to an OCD-compliant bill model.
        3) Yields the OCD-compliant bill model instance

        A data file that cannot be opened or converted is reported on stdout
        (together with its traceback) and skipped; remaining files are still
        processed.

        @return: generator for federal US bills in OCD-compliant format
        @rtype: generator
        """

        # run scraper first to pull in all the bill data
        self._run_unitedstates_bill_scraper()
        # iterate over all the files and build and yield Bill objects
        for filename in find_files(settings.SCRAPED_DATA_DIR,
                                   r'.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'):
            try:
                # Keep the file open only while it is being parsed.
                with open(filename) as json_file:
                    json_data = json.load(json_file)

                bill_type = json_data['bill_type']
                type_info = constants.TYPE_MAP[bill_type]
                chamber = type_info['chamber']

                # Initialize Object
                bill = Bill(type_info['canonical'] + ' ' + json_data['number'],
                            json_data['congress'],
                            json_data['official_title'],
                            chamber=chamber)

                # add source of data
                bill.add_source(json_data['url'], note='all')

                # add subjects
                for subject in json_data['subjects']:
                    bill.add_subject(subject)

                # add summary
                if 'summary' in json_data and json_data['summary'] is not None:
                    bill.add_abstract(json_data['summary']['text'],
                                      json_data['summary']['as'],
                                      json_data['summary']['date'])

                # add titles
                for item in json_data['titles']:
                    bill.add_title(item['title'], item['type'])

                # add other/related Bills
                for b in json_data['related_bills']:
                    if 'type' in b and b['type'] == 'bill':
                        # bill_id is split on '-' into the bill code and its session
                        split = b['bill_id'].split('-')
                        m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0])

                        bill.add_related_bill(constants.TYPE_MAP[m.group(1)]['canonical'] + ' ' + m.group(2),
                                              legislative_session=split[1],
                                              relation_type='companion')

                # add sponsor
                bill.add_sponsorship_by_identifier(json_data['sponsor']['name'], 'person', 'person', True,
                                                   scheme='thomas_id', identifier=json_data['sponsor']['thomas_id'],
                                                   chamber=chamber)

                # add cosponsors
                for cs in json_data['cosponsors']:
                    bill.add_sponsorship_by_identifier(cs['name'], 'person', 'person', False,
                                                       scheme='thomas_id', identifier=cs['thomas_id'],
                                                       chamber=chamber)

                # add introduced_at and actions
                bill.add_action('date of introduction', datetime_to_date(json_data['introduced_at']),
                                chamber=chamber,
                                related_entities=[])

                # add other actions
                for action in json_data['actions']:
                    bill.actions.append({'date': datetime_to_date(action['acted_at']),
                                         'type': [action['type']],
                                         'description': action['text'],
                                         'actor': chamber,
                                         'related_entities': []
                                         })

                # add bill versions
                text_versions_dir = os.path.join(settings.SCRAPED_DATA_DIR,
                                                 'data', bill.legislative_session, 'bills', bill_type,
                                                 bill_type + json_data['number'],
                                                 'text-versions')
                for version_path in find_files(text_versions_dir, r'/.*/*\.json'):
                    try:
                        with open(version_path) as version_file:
                            version_json_data = json.load(version_file)
                        # one version entry per (mimetype, url) pair
                        for k, v in version_json_data['urls'].items():
                            bill.versions.append({'date': datetime_to_date(version_json_data['issued_on']),
                                                  'type': version_json_data['version_code'],
                                                  'name': constants.VERSION_MAP[version_json_data['version_code']],
                                                  'links': [{'mimetype': k, 'url': v}]})
                    except IOError:
                        print("Unable to open or parse file with path " + version_path)
                        continue

            except IOError:
                print("Unable to open file with path " + filename)
                print(traceback.format_exc())
                continue
            except KeyError:
                print("Unable to parse file with path " + filename)
                print(traceback.format_exc())
                continue
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt,
                # SystemExit and GeneratorExit must propagate.
                print('Unknown error with ' + filename)
                print(traceback.format_exc())
                continue

            # finally yield bill object; done outside the try block so that
            # closing the generator early is never mistaken for a bad file
            yield bill
Пример #36
0
def test_full_bill():
    """Import a fully populated scraped bill and verify every part in the database."""
    create_jurisdiction()

    # scraped fixtures: a person, a chamber and a committee under that chamber
    adam = ScrapePerson('Adam Smith')
    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(
        name='Arbitrary Committee',
        classification='committee',
        parent_id=house._id,
    )

    # bill from the previous session, referenced below as a related bill
    previous = ScrapeBill(
        'HB 99', '1899', 'Axe & Tack Tax Act',
        classification='tax bill',
        from_organization=house._id,
    )

    scraped = ScrapeBill(
        'HB 1', '1900', 'Axe & Tack Tax Act',
        classification='tax bill',
        from_organization=house._id,
    )
    scraped.subject = ['taxes', 'axes']
    scraped.add_identifier('SB 9')
    scraped.add_title('Tack & Axe Tax Act')
    scraped.add_action('introduced in house', '1900-04-01', chamber='lower')
    referral = scraped.add_action(
        'sent to arbitrary committee', '1900-04-04', chamber='lower')
    referral.add_related_entity('arbitrary committee', 'organization',
                                committee._id)
    scraped.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session")
    # one sponsor linked to a known person, one left unresolved
    scraped.add_sponsorship(
        'Adam Smith',
        classification='extra sponsor',
        entity_type='person',
        primary=False,
        entity_id=adam._id,
    )
    scraped.add_sponsorship(
        'Jane Smith',
        classification='lead sponsor',
        entity_type='person',
        primary=True,
    )
    scraped.add_abstract(
        'This is an act about axes and taxes and tacks.',
        note="official",
        date='1969-10-20',
    )
    # two links sharing a note collapse into one document
    for url, media_type in (('http://example.com/fn.pdf', 'application/pdf'),
                            ('http://example.com/fn.html', 'text/html')):
        scraped.add_document_link('Fiscal Note', url, media_type=media_type)
    scraped.add_version_link(
        'Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    scraped.add_source('http://example.com/source')

    # run the importers: organizations and people first, then the bills
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), committee.as_dict()])

    person_importer = PersonImporter('jid')
    person_importer.import_data([adam.as_dict()])

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([previous.as_dict(), scraped.as_dict()])

    # fetch the stored bill and compare it with what was scraped
    stored = Bill.objects.get(identifier='HB 1')
    assert stored.from_organization.classification == 'lower'
    assert stored.identifier == scraped.identifier
    assert stored.title == scraped.title
    assert stored.classification == scraped.classification
    assert stored.subject == ['taxes', 'axes']
    assert stored.abstracts.get().note == 'official'
    assert stored.abstracts.get().date == '1969-10-20'

    # alternate title and identifier
    assert stored.other_titles.get().title == 'Tack & Axe Tax Act'
    assert stored.other_identifiers.get().identifier == 'SB 9'

    # actions, in insertion order (a failure here would be intermittent)
    stored_actions = list(stored.actions.all())
    assert len(stored_actions) == 2
    lower_house = Organization.objects.get(classification='lower')
    assert stored_actions[0].organization == lower_house
    assert stored_actions[0].description == "introduced in house"
    assert stored_actions[1].description == "sent to arbitrary committee"
    assert (stored_actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related bill recorded and resolved to the real bill
    related = stored.related_bills.get()
    assert related.identifier == 'HB 99'
    assert related.related_bill.identifier == 'HB 99'

    # sponsorships: the primary one is unlinked, the other points at Adam
    stored_sponsorships = stored.sponsorships.all()
    assert len(stored_sponsorships) == 2
    adam_in_db = Person.objects.get(name='Adam Smith')
    for sponsorship in stored_sponsorships:
        if not sponsorship.primary:
            assert sponsorship.person == adam_in_db
        else:
            assert sponsorship.person is None
            assert sponsorship.organization is None

    # versions and documents, with their links
    stored_versions = stored.versions.all()
    assert len(stored_versions) == 1
    assert stored_versions[0].links.count() == 1
    stored_documents = stored.documents.all()
    assert len(stored_documents) == 1
    assert stored_documents[0].links.count() == 2

    # sources
    assert stored.sources.count() == 1
Пример #37
0
def test_full_bill():
    """Import a fully populated bill whose sponsor already exists in the database."""
    create_jurisdiction()

    # the sponsor is created directly in the database, not through an importer
    adam = Person.objects.create(id='person-id', name='Adam Smith')
    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(
        name='Arbitrary Committee',
        classification='committee',
        parent_id=house._id,
    )

    # bill from the previous session, referenced below as a related bill
    previous = ScrapeBill(
        'HB 99', '1899', 'Axe & Tack Tax Act',
        classification='tax bill',
        from_organization=house._id,
    )

    scraped = ScrapeBill(
        'HB 1', '1900', 'Axe & Tack Tax Act',
        classification='tax bill',
        from_organization=house._id,
    )
    scraped.subject = ['taxes', 'axes']
    scraped.add_identifier('SB 9')
    scraped.add_title('Tack & Axe Tax Act')
    scraped.add_action('introduced in house', '1900-04-01', chamber='lower')
    referral = scraped.add_action(
        'sent to arbitrary committee', '1900-04-04', chamber='lower')
    referral.add_related_entity('arbitrary committee', 'organization',
                                committee._id)
    scraped.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session")
    # one sponsor linked to the existing person, one left unresolved
    scraped.add_sponsorship(
        'Adam Smith',
        classification='extra sponsor',
        entity_type='person',
        primary=False,
        entity_id=adam.id,
    )
    scraped.add_sponsorship(
        'Jane Smith',
        classification='lead sponsor',
        entity_type='person',
        primary=True,
    )
    scraped.add_abstract(
        'This is an act about axes and taxes and tacks.', note="official")
    # two links sharing a note collapse into one document
    for url, media_type in (('http://example.com/fn.pdf', 'application/pdf'),
                            ('http://example.com/fn.html', 'text/html')):
        scraped.add_document_link('Fiscal Note', url, media_type=media_type)
    scraped.add_version_link(
        'Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    scraped.add_source('http://example.com/source')

    # run the importers: organizations first, then the bills
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), committee.as_dict()])

    # The person was created behind the back of the import transaction, so
    # fake the json-id to db-id mapping (they match in this case). This
    # leans on an implementation detail, but it is the cleanest way to
    # short-circuit the json id lookup.
    person_importer = PersonImporter('jid')
    person_importer.json_to_db_id['person-id'] = 'person-id'

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([previous.as_dict(), scraped.as_dict()])

    # fetch the stored bill and compare it with what was scraped
    stored = Bill.objects.get(identifier='HB 1')
    assert stored.from_organization.classification == 'lower'
    assert stored.identifier == scraped.identifier
    assert stored.title == scraped.title
    assert stored.classification == scraped.classification
    assert stored.subject == ['taxes', 'axes']
    assert stored.abstracts.get().note == 'official'

    # alternate title and identifier
    assert stored.other_titles.get().title == 'Tack & Axe Tax Act'
    assert stored.other_identifiers.get().identifier == 'SB 9'

    # actions, in insertion order (a failure here would be intermittent)
    stored_actions = list(stored.actions.all())
    assert len(stored_actions) == 2
    lower_house = Organization.objects.get(classification='lower')
    assert stored_actions[0].organization == lower_house
    assert stored_actions[0].description == "introduced in house"
    assert stored_actions[1].description == "sent to arbitrary committee"
    assert (stored_actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related bill recorded and resolved to the real bill
    related = stored.related_bills.get()
    assert related.identifier == 'HB 99'
    assert related.related_bill.identifier == 'HB 99'

    # sponsorships: the primary one is unlinked, the other points at Adam
    stored_sponsorships = stored.sponsorships.all()
    assert len(stored_sponsorships) == 2
    for sponsorship in stored_sponsorships:
        if not sponsorship.primary:
            assert sponsorship.person == adam
        else:
            assert sponsorship.person is None
            assert sponsorship.organization is None

    # versions and documents, with their links
    stored_versions = stored.versions.all()
    assert len(stored_versions) == 1
    assert stored_versions[0].links.count() == 1
    stored_documents = stored.documents.all()
    assert len(stored_documents) == 1
    assert stored_documents[0].links.count() == 2

    # sources
    assert stored.sources.count() == 1
Пример #38
0
    def scrape_bills(self, session):
        """Yield one Bill per measure returned by the API for ``session``.

        Each bill carries its summary abstract, sponsorships, a source URL,
        version links (measure documents plus adopted committee amendments)
        and its history actions.

        :param session: session identifier; must be a key of SESSION_KEYS.
        :raises KeyError: if the session, a chamber code, a measure prefix or
            a sponsor level is not present in the corresponding lookup table.
        """
        session_key = SESSION_KEYS[session]
        measures_response = self.api_client.get("measures",
                                                page=500,
                                                session=session_key)

        legislators = index_legislators(self, session_key)

        # API sponsor level -> sponsorship classification
        sponsor_classifications = {
            "Chief": "primary",
            "Regular": "cosponsor",
        }

        for measure in measures_response:
            bid = "{} {}".format(measure["MeasurePrefix"],
                                 measure["MeasureNumber"])

            # the first letter of the prefix selects the chamber, the rest
            # of the prefix selects the bill type
            chamber = self.chamber_code[bid[0]]
            bill = Bill(
                bid.replace(" ", ""),
                legislative_session=session,
                chamber=chamber,
                title=measure["RelatingTo"],
                classification=self.bill_types[measure["MeasurePrefix"][1:]],
            )
            bill.add_abstract(measure["MeasureSummary"].strip(),
                              note="summary")

            for sponsor in measure["MeasureSponsors"]:
                legislator_code = sponsor["LegislatoreCode"]  # typo in API
                if legislator_code:
                    try:
                        legislator = legislators[legislator_code]
                    except KeyError:
                        logger.warning(
                            "Legislator {} not found in session {}".format(
                                legislator_code, session))
                        # fall back to the raw code as the sponsor name
                        legislator = legislator_code
                    sponsor_level = sponsor["SponsorLevel"]
                    bill.add_sponsorship(
                        name=legislator,
                        classification=sponsor_classifications[sponsor_level],
                        entity_type="person",
                        primary=sponsor_level == "Chief",
                    )

            bill.add_source(
                "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}"
                .format(session=session_key, bid=bid.replace(" ", "")))
            for document in measure["MeasureDocuments"]:
                # TODO: probably mixing documents & versions here - should revisit
                try:
                    bill.add_version_link(
                        document["VersionDescription"],
                        document["DocumentUrl"],
                        media_type="application/pdf",
                    )
                except ValueError:
                    logger.warning("Duplicate link found for {}".format(
                        document["DocumentUrl"]))

            # only amendments whose "Meaning" mentions "adopted" become versions
            for agenda_item in measure["CommitteeAgendaItems"]:
                for document in agenda_item["CommitteeProposedAmendments"]:
                    if "adopted" in document["Meaning"].lower():
                        amd_name = "{} Amendment {}".format(
                            document["CommitteeCode"],
                            document["AmendmentNumber"])
                        bill.add_version_link(
                            amd_name,
                            document["ProposedAmendmentUrl"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

            for action in measure["MeasureHistoryActions"]:
                classifiers = self.determine_action_classifiers(
                    action["ActionText"])
                when = datetime.datetime.strptime(action["ActionDate"],
                                                  "%Y-%m-%dT%H:%M:%S")
                # API timestamps are naive; attach the scraper's timezone
                when = self.tz.localize(when)
                bill.add_action(
                    action["ActionText"],
                    when,
                    chamber=self.chamber_code[action["Chamber"]],
                    classification=classifiers,
                )

            yield bill
Пример #39
0
    def scrape_bill(self, row, session):
        """Build a Bill from one API row plus its detail page.

        Yields whatever ``scrape_votes`` produces for the bill, then the
        bill itself.

        :param row: mapping describing one piece of legislation.
        :param session: legislative session the bill belongs to.
        :raises ValueError: if the display code has more than one space but
            is neither an amended (`` w/ ``) nor a substitute (`` for ``) form.
        """
        identifier = row['LegislationDisplayCode']
        amendment = substitute = None

        # More than one space means the code carries extra parts.
        if identifier.count(' ') > 1:
            if ' w/ ' in identifier:
                self.info('Found amended bill `{}`'.format(identifier))
                identifier, amendment = identifier.split(' w/ ')
            # A bill can _both_ be amended and be substituted
            if ' for ' in identifier:
                self.info("Found substitute to use instead: `{}`".format(identifier))
                substitute, identifier = identifier.split(' for ')
            if substitute is None and amendment is None:
                raise ValueError('unknown bill_id format: ' + identifier)

        classification = self.classify_bill(identifier)
        if identifier.startswith('S'):
            chamber = 'upper'
        else:
            chamber = 'lower'

        bill = Bill(
            identifier=identifier,
            legislative_session=session,
            chamber=chamber,
            title=row['LongTitle'],
            classification=classification,
        )

        synopsis = row['Synopsis']
        if synopsis:
            bill.add_abstract(synopsis, 'synopsis')
        short_title = row['ShortTitle']
        if short_title:
            bill.add_title(short_title, 'short title')
        lead_sponsor = row['SponsorPersonId']
        if lead_sponsor:
            self.add_sponsor_by_legislator_id(bill, lead_sponsor, 'primary')
        if substitute:
            bill.extras['substitute'] = substitute
        if amendment:
            bill.extras['amendment'] = amendment

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        legislation_id = row['LegislationId']
        detail_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
            legislation_id)
        bill.add_source(detail_url, note='text/html')

        page = self.lxmlize(detail_url)

        # Sponsor links are legislator detail URLs; dropping the prefix leaves the id.
        legislator_url_prefix = 'https://legis.delaware.gov/LegislatorDetail?personId='
        sponsor_sections = (
            ('//label[text()="Additional Sponsor(s):"]/following-sibling::div/a/@href',
             'primary'),
            ('//label[text()="Co-Sponsor(s):"]/following-sibling::div/a/@href',
             'cosponsor'),
        )
        for links_xpath, sponsor_type in sponsor_sections:
            for link in page.xpath(links_xpath):
                self.add_sponsor_by_legislator_id(
                    bill, link.replace(legislator_url_prefix, ''), sponsor_type)

        text_links = page.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for text_url in text_links:
            bill.add_version_link('Bill Text', text_url,
                                  media_type=self.mime_from_link(text_url))

        for fiscal_url in page.xpath('//div[contains(@class,"fiscalNote")]/a/@href'):
            self.scrape_fiscal_note(bill, fiscal_url)

        self.scrape_actions(bill, legislation_id)

        if row['HasAmendments'] is True:
            self.scrape_amendments(bill, legislation_id)

        yield from self.scrape_votes(bill, legislation_id, session)

        yield bill
Пример #40
0
    def scrape(self, session=None, chamber=None):
        """Yield the votes and the bill for every piece of legislation in a session.

        Args:
            session: Key into ``SESSION_SITE_IDS``. When falsy,
                ``self.latest_session()`` is used instead.
            chamber: Accepted but never read by this method.

        Yields:
            For each instrument that has a caption: one ``VoteEvent`` per
            entry in its ``Votes`` (if any), followed by the ``Bill`` itself.
        """
        # Document-type suffix (everything after the first letter) -> bill
        # classification.
        bill_type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
        }

        # First letter of a document type or of an action code -> chamber.
        chamber_map = {
            'H': 'lower',
            'S': 'upper',
            'J': 'joint',
            'E': 'legislature',  # Effective date
        }

        # Action code -> list of action classifications. ``None`` means the
        # code is known but the action is recorded without a classification.
        action_code_map = {
            'HI': None,
            'SI': None,
            'HH': None,
            'SH': None,
            'HPF': ['introduction'],
            'HDSAS': None,
            'SPF': ['introduction'],
            'HSR': ['reading-2'],
            'SSR': ['reading-2'],
            'HFR': ['reading-1'],
            'SFR': ['reading-1'],
            'HRECM': ['withdrawal', 'referral-committee'],
            'SRECM': ['withdrawal', 'referral-committee'],
            'SW&C': ['withdrawal', 'referral-committee'],
            'HW&C': ['withdrawal', 'referral-committee'],
            'HRA': ['passage'],
            'SRA': ['passage'],
            'HPA': ['passage'],
            'HRECO': None,
            'SPA': ['passage'],
            'HTABL': None,  # 'House Tabled' - what is this?
            'SDHAS': None,
            'HCFR': ['committee-passage-favorable'],
            'SCFR': ['committee-passage-favorable'],
            'HRAR': ['referral-committee'],
            'SRAR': ['referral-committee'],
            'STR': ['reading-3'],
            'SAHAS': None,
            'SE': ['passage'],
            'SR': ['referral-committee'],
            'HTRL': ['reading-3', 'failure'],
            'HTR': ['reading-3'],
            'S3RLT': ['reading-3', 'failure'],
            'HASAS': None,
            'S3RPP': None,
            'STAB': None,
            'SRECO': None,
            'SAPPT': None,
            'HCA': None,
            'HNOM': None,
            'HTT': None,
            'STT': None,
            'SRECP': None,
            'SCRA': None,
            'SNOM': None,
            'S2R': ['reading-2'],
            'H2R': ['reading-2'],
            'SENG': ['passage'],
            'HENG': ['passage'],
            'HPOST': None,
            'HCAP': None,
            'SDSG': ['executive-signature'],
            'SSG': ['executive-receipt'],
            'Signed Gov': ['executive-signature'],
            'HDSG': ['executive-signature'],
            'HSG': ['executive-receipt'],
            'EFF': None,
            'HRP': None,
            'STH': None,
            'HTS': None,
        }

        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)
        # Raises KeyError for a session that SESSION_SITE_IDS does not list.
        sid = SESSION_SITE_IDS[session]

        # NOTE(review): ``backoff`` is defined elsewhere; presumably it retries
        # the wrapped service call -- confirm against its definition.
        legislation = backoff(self.lservice.GetLegislationForSession,
                              sid)['LegislationIndex']

        for leg in legislation:
            lid = leg['Id']
            # One detail request per piece of legislation.
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            # Copy the first element of StatusHistory into a plain list.
            history = [x for x in instrument['StatusHistory'][0]]

            # Actions are consumed in the reverse of the order in which
            # StatusHistory lists them. ``reversed`` returns a one-shot
            # iterator, which is only walked once (in the action loop below).
            actions = reversed([{
                'code': x['Code'],
                'action': x['Description'],
                '_guid': x['Id'],
                'date': x['Date']
            } for x in history])

            guid = instrument['Id']

            # A little bit hacky.
            # The first character of the document type selects the chamber and
            # the remainder selects the bill type; an unmapped value raises
            # KeyError.
            bill_prefix = instrument['DocumentType']
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = '%s %s' % (
                bill_prefix,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            # Legislation without a caption is skipped entirely; neither a
            # bill nor any of its votes is yielded.
            if title is None:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=bill_chamber,
                        title=title,
                        classification=bill_type)
            bill.add_abstract(description, note='description')
            bill.extras = {'guid': guid}

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    # Each entry unpacks into a pair; only the second element
                    # is used, and only to find the id of the full vote record.
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = VoteEvent(
                        start_date=vote_['Date'].strftime('%Y-%m-%d'),
                        motion_text=vote_['Caption'] or 'Vote on Bill',
                        chamber={
                            'House': 'lower',
                            'Senate': 'upper'
                        }[vote_['Branch']],
                        # A tie (Yeas == Nays) is recorded as 'fail'.
                        result='pass'
                        if vote_['Yeas'] > vote_['Nays'] else 'fail',
                        classification='passage',
                        bill=bill,
                    )
                    vote.set_count('yes', vote_['Yeas'])
                    vote.set_count('no', vote_['Nays'])
                    vote.set_count('other',
                                   vote_['Excused'] + vote_['NotVoting'])

                    vote.add_source(self.vsource)

                    # Any MemberVoted value other than 'Yea'/'Nay' is
                    # recorded as 'other'.
                    methods = {'Yea': 'yes', 'Nay': 'no'}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        vote.vote(methods.get(how, 'other'), whom['Name'])

                    # Votes are yielded before the bill they reference.
                    yield vote

            # Group committee names by chamber so that committee-related
            # actions can be linked to them below.
            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        'House': 'lower',
                        'Senate': 'upper',
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                # The acting chamber comes from the first letter of the code.
                action_chamber = chamber_map[action['code'][0]]

                try:
                    action_types = action_code_map[action['code']]
                except KeyError:
                    # Unknown codes are logged and left unclassified rather
                    # than aborting the scrape.
                    error_msg = 'Code {code} for action {action} not recognized.'.format(
                        code=action['code'], action=action['action'])

                    self.logger.warning(error_msg)

                    action_types = None

                # Only actions with a classification containing 'committee'
                # get the acting chamber's committees attached.
                committees = []
                if action_types and any(
                    ('committee' in x for x in action_types)):
                    committees = [
                        str(x) for x in ccommittees.get(action_chamber, [])
                    ]

                act = bill.add_action(action['action'],
                                      action['date'].strftime('%Y-%m-%d'),
                                      classification=action_types,
                                      chamber=action_chamber)
                for committee in committees:
                    act.add_related_entity(committee, 'organization')
                act.extras = {
                    'code': action['code'],
                    'guid': action['_guid'],
                }

            # 'Sponsors' entries are only considered when 'Authors' is
            # present. Note that ``+=`` extends the 'Authors' list in place.
            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            # Resolve each sponsorship's member id to a member record.
            sponsors = [(x['Type'], self.get_member(x['MemberId']))
                        for x in sponsors]

            for typ, sponsor in sponsors:
                name = '{First} {Last}'.format(**dict(sponsor['Name']))
                # Any sponsorship type containing 'Author' counts as primary.
                bill.add_sponsorship(
                    name,
                    entity_type='person',
                    classification='primary'
                    if 'Author' in typ else 'secondary',
                    primary='Author' in typ,
                )

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x]
                    for x in ['Description', 'Url', 'Id', 'Version']
                ]
                # Every version is registered with a PDF media type.
                link = bill.add_version_link(name,
                                             url,
                                             media_type='application/pdf')
                link['extras'] = {
                    '_internal_document_id': doc_id,
                    '_version_id': version_id
                }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(
                SOURCE_URL.format(**{
                    'session': session,
                    'bid': guid,
                }))

            yield bill
Пример #41
0
    def scrape_bill(self, bill_num, session):
        """Fetch one bill from the LSO JSON service; yield its votes, then the Bill.

        Args:
            bill_num: Bill number interpolated into the service URL.
            session: Session identifier; used in URLs and as the bill's
                legislative session.

        Yields:
            Everything ``self.scrape_vote`` yields for each roll call,
            followed by the populated ``Bill``.
        """
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        site_url = 'http://wyoleg.gov/{}'
        pdf = 'application/pdf'

        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = ('http://wyoleg.gov/LsoService/api/BillInformation/{}/'
                         '{}?calendarDate='.format(session, bill_num))
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        # The leading letter of the bill number identifies the originating
        # chamber (the amendment example below, 'SF0050...', is a Senate
        # file). The previous check tested the truthiness of that letter,
        # which is always true, so every bill was classified as 'lower'.
        chamber = 'lower' if bill_json['bill'][0].upper() == 'H' else 'upper'

        bill = Bill(
            identifier=bill_json['bill'],
            legislative_session=session,
            title=bill_json['catchTitle'],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
            session, bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            # Empty locations and locations missing from chamber_map leave
            # the action without an actor.
            actor = chamber_map.get(action_json['location'])

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']
            }

        # Bill text versions: (JSON key, note). Skipped when the key is empty.
        for key, note in (('introduced', 'Introduced'),
                          ('enrolledAct', 'Enrolled')):
            if bill_json[key]:
                bill.add_version_link(
                    note=note,
                    url=site_url.format(bill_json[key]),
                    media_type=pdf,  # optional but useful!
                )

        # Supporting documents: (JSON key, note). Skipped when the key is empty.
        for key, note in (('fiscalNote', 'Fiscal Note'),
                          ('digest', 'Bill Digest')):
            if bill_json[key]:
                bill.add_document_link(
                    note=note,
                    url=site_url.format(bill_json[key]),
                    media_type=pdf,  # optional but useful!
                )

        # Veto letters are attached as version links, as before.
        for veto in bill_json['vetoes'] or []:
            bill.add_version_link(
                note=veto['vetoLinkText'],
                url=site_url.format(veto['vetoLinkPath']),
                media_type=pdf,  # optional but useful!
            )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            # Sponsor and status are only shown when both are present.
            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type=pdf,
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            # Sponsors without a title are recorded as organizations.
            sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
            bill.add_sponsorship(name=sponsor['name'],
                                 classification=status,
                                 entity_type=sponsor_type,
                                 primary=sponsor['primarySponsor'])

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            # Normalise the service's MM/DD/YYYY date to ISO format.
            eff = datetime.datetime.strptime(bill_json['effectiveDate'],
                                             '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        # Votes are yielded before the bill they belong to.
        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Пример #42
0
    def _scrape_bill(self, session, bill_data):
        """Build a Bill from one API bill record and yield it.

        Args:
            session: Legislative session; also passed as ``session_year``
                when building the API source URL.
            bill_data: Bill record (dict) as consumed by
                ``self._parse_bill_details``.

        Yields:
            For an upper-chamber bill, the result of
            ``self._parse_senate_votes`` for each vote item; then, in all
            cases, the populated ``Bill``.
        """
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, (prefix, number, active_version)) = details

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            # Fall back to the summary when no title was parsed.
            title=title or bill_data['summary'],
            classification=bill_type,
        )

        if bill_data['summary']:
            bill.add_abstract(bill_data['summary'], note='')

        bill_active_version = bill_data['amendments']['items'][active_version]

        # Parse sponsors.
        sponsor = bill_data['sponsor']
        if sponsor is not None:
            if sponsor['rules'] is True:
                bill.add_sponsorship(
                    'Rules Committee',
                    entity_type='organization',
                    classification='primary',
                    primary=True,
                )
            elif not sponsor['budget']:
                primary_sponsor = sponsor['member']
                bill.add_sponsorship(
                    primary_sponsor['shortName'],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

                # There *shouldn't* be cosponsors if there is no sponsor.
                for cosponsor in bill_active_version['coSponsors']['items']:
                    bill.add_sponsorship(
                        cosponsor['shortName'],
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )

        # List companion bill.
        # "sameAs" may be absent (or empty); the earlier code defaulted it to
        # {} and then indexed ['items'], which raised KeyError in exactly the
        # case the default was meant to cover.
        same_as_items = (bill_active_version.get('sameAs') or {}).get('items')
        if same_as_items:
            # Only the first companion bill is recorded.
            companion = same_as_items[0]
            companion_bill_id = companion['basePrintNo']

            # Build companion bill session, e.g. 2017 -> '2017-2018'.
            start_year = companion['session']
            end_year = start_year + 1
            companion_bill_session = '-'.join([str(start_year), str(end_year)])

            # Attach companion bill data.
            bill.add_related_bill(
                companion_bill_id,
                companion_bill_session,
                relation_type='companion',
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            # Round-trip through strptime so malformed dates raise here.
            action_datetime = datetime.datetime.strptime(action['date'], '%Y-%m-%d')
            action_date = action_datetime.date()
            types, _ = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(
                action['text'],
                action_date.strftime('%Y-%m-%d'),
                chamber=chamber,
                classification=types,
            )

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_url = self.api_client.root + self.api_client.resources['bill'].format(
            session_year=session,
            bill_id=bill_id,
            summary='',
            detail='')
        bill.add_source(api_url)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                yield self._parse_senate_votes(vote_data, bill, api_url)
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        # NOTE(review): both URLs are built from bill_id, not from the
        # amendment's print number, so every amendment links to the same
        # pair of URLs -- confirm this is intended.
        for amendment in bill_data['amendments']['items'].values():
            version = amendment['printNo']

            html_version = version + ' HTML'
            html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
                '{}&term={}'.format(bill_id, self.term_start_year)
            bill.add_version_link(
                html_version,
                html_url,
                on_duplicate='ignore',
                media_type='text/html',
            )

            pdf_version = version + ' PDF'
            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
                .format(self.term_start_year, bill_id)
            bill.add_version_link(
                pdf_version,
                pdf_url,
                on_duplicate='ignore',
                media_type='application/pdf',
            )

        yield bill
Пример #43
0
    def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
        """Scrape one bill page; yield the Bill, then its votes.

        Args:
            chamber: Chamber of origin passed to ``Bill``.
            session: Legislative session passed to ``Bill`` and to
                ``self.scrape_votes``.
            doc_type: Document-type prefix; combined with the ``DocNum``
                from ``url`` to form the bill identifier.
            url: URL of the bill's page.
            bill_type: Optional classification; when falsy it is looked up
                as ``DOC_TYPES[doc_type[1:]]``.

        Yields:
            The populated ``Bill``, followed by everything
            ``self.scrape_votes`` yields. Nothing is yielded when the page
            request fails with a 500 error.
        """
        try:
            html = self.get(url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
        except scrapelib.HTTPError as e:
            # Only a 500 response is tolerated; anything else fails loudly.
            assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e)
            self.warning("500 error for bill page; skipping bill")
            return

        # bill id, title, summary
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = bill_type or DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/'
                          'text()')[0].strip()
        summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/'
                            'text()')[0].strip()

        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    title=title,
                    classification=bill_type,
                    chamber=chamber)

        bill.add_abstract(summary, note='')

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # Committee source URL -> committee name, handed to scrape_votes.
        committee_actors = {}

        # actions
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action_elem in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            date = self.localize(date).date()
            actor = actor.text_content()
            # NOTE(review): for any actor other than 'House'/'Senate',
            # actor_id keeps the previous row's value (or is unbound on the
            # first row). Left unchanged because the intended fallback is
            # not visible here.
            if actor == 'House':
                actor_id = {'classification': 'lower'}
            elif actor == 'Senate':
                actor_id = {'classification': 'upper'}

            action = action_elem.text_content()
            classification, related_orgs = _categorize_action(action)

            if (related_orgs and any(c.startswith('committee') for c in classification)):
                # Exactly one committee link is expected in the action cell;
                # the single-element unpacking raises otherwise.
                (name, source), = [(a.text, a.get('href')) for a in
                                   action_elem.xpath('a')
                                   if 'committee' in a.get('href')]
                source = canonicalize_url(source)
                actor_id = {'sources__url': source,
                            'classification': 'committee'}
                committee_actors[source] = name

            bill.add_action(action, date,
                            organization=actor_id,
                            classification=classification,
                            related_entities=related_orgs)

            if 'sponsor' in action.lower():
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        # The loop variable is named sponsor_chamber so it no longer shadows
        # the ``chamber`` parameter.
        for spontype, sponsor, sponsor_chamber, official_type in sponsor_list:
            primary = official_type == 'primary'
            if sponsor_chamber:
                bill.add_sponsorship(sponsor, spontype, 'person',
                                     primary=primary,
                                     chamber=sponsor_chamber)
            else:
                # The name comes first and the classification second, as in
                # the branch above; this call previously passed them swapped.
                bill.add_sponsorship(sponsor, spontype, 'person',
                                     primary=primary)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)
        yield bill

        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        yield from self.scrape_votes(session, bill, votes_url, committee_actors)
Пример #44
0
    def scrape_bill(self, bill_num, session):
        """Fetch one bill from the LSO JSON service; yield its votes, then the Bill.

        Args:
            bill_num: Bill number interpolated into the service URL.
            session: Session identifier; used in URLs and as the bill's
                legislative session.

        Yields:
            Everything ``self.scrape_vote`` yields for each roll call,
            followed by the populated ``Bill``.
        """
        chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = ("http://wyoleg.gov/LsoService/api/BillInformation/{}/"
                         "{}?calendarDate=".format(session, bill_num))
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode("utf-8"))

        # The leading letter of the bill number identifies the originating
        # chamber (the amendment example below, "SF0050...", is a Senate
        # file). Testing the truthiness of that letter is always true, so
        # every bill used to be classified as "lower".
        chamber = "lower" if bill_json["bill"][0].upper() == "H" else "upper"

        bill = Bill(
            identifier=bill_json["bill"],
            legislative_session=session,
            title=bill_json["catchTitle"],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json["billTitle"])

        source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
            session, bill_json["bill"])
        bill.add_source(source_url)

        for action_json in bill_json["billActions"]:
            utc_action_date = self.parse_local_date(action_json["statusDate"])

            # Empty locations and locations missing from chamber_map leave
            # the action without an actor.
            actor = None
            if action_json["location"] and action_json[
                    "location"] in chamber_map:
                actor = chamber_map[action_json["location"]]

            action = bill.add_action(
                chamber=actor,
                description=action_json["statusMessage"],
                date=utc_action_date,
                classification=categorize_action(action_json["statusMessage"]),
            )

            action.extras = {
                "billInformationID": action_json["billInformationID"]
            }

        # Each optional path below is relative to the site root and is
        # skipped when the service returns an empty value for it.
        if bill_json["introduced"]:
            url = "http://wyoleg.gov/{}".format(bill_json["introduced"])

            bill.add_version_link(
                note="Introduced",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["enrolledAct"]:
            url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"])

            bill.add_version_link(
                note="Enrolled",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["fiscalNote"]:
            url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"])

            bill.add_document_link(
                note="Fiscal Note",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["digest"]:
            url = "http://wyoleg.gov/{}".format(bill_json["digest"])

            bill.add_document_link(
                note="Bill Digest",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        # Veto letters are attached as version links.
        if bill_json["vetoes"]:
            for veto in bill_json["vetoes"]:
                url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"])
                bill.add_version_link(
                    note=veto["vetoLinkText"],
                    url=url,
                    media_type="application/pdf",  # optional but useful!
                )

        for amendment in bill_json["amendments"]:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
                session, amendment["amendmentNumber"])

            # Sponsor and status are only shown when both are present.
            if amendment["sponsor"] and amendment["status"]:
                title = "Amendment {} ({}) - {} ({})".format(
                    amendment["amendmentNumber"],
                    amendment["order"],
                    amendment["sponsor"],
                    amendment["status"],
                )
            else:
                title = "Amendment {} ({})".format(
                    amendment["amendmentNumber"], amendment["order"])
            # add versions of the bill text
            version = bill.add_version_link(note=title,
                                            url=url,
                                            media_type="application/pdf")
            version["extras"] = {
                "amendmentNumber": amendment["amendmentNumber"],
                "sponsor": amendment["sponsor"],
            }

        for sponsor in bill_json["sponsors"]:
            status = "primary" if sponsor["primarySponsor"] else "cosponsor"
            # Sponsors without a title are recorded as organizations.
            sponsor_type = "person" if sponsor[
                "sponsorTitle"] else "organization"
            bill.add_sponsorship(
                name=sponsor["name"],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor["primarySponsor"],
            )

        if bill_json["summary"]:
            bill.add_abstract(note="summary", abstract=bill_json["summary"])

        if bill_json["enrolledNumber"]:
            bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]

        if bill_json["chapter"]:
            bill.extras["chapter"] = bill_json["chapter"]

        if bill_json["effectiveDate"]:
            # Normalise the service's MM/DD/YYYY date to ISO format.
            eff = datetime.datetime.strptime(bill_json["effectiveDate"],
                                             "%m/%d/%Y")
            bill.extras["effective_date"] = eff.strftime("%Y-%m-%d")

        bill.extras["wy_bill_id"] = bill_json["id"]

        # Votes are yielded before the bill they belong to.
        for vote_json in bill_json["rollCalls"]:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Пример #45
0
    def get_bill(self, matter):
        """Build a Bill from a Legistar matter record.

        Returns None instead of a Bill when the matter belongs to one of the
        untyped bodies listed below, is listed in ``DUPLICATED_ACTIONS``,
        lacks an intro date, name or file number, carries the known
        duplicate attachment, or when fetching its sponsorships, related
        matters or text fails with the errors handled below.
        """
        # NYC Legistar currently has no conventional "Types" for three newly
        # added committees: https://legistar.council.nyc.gov/Departments.aspx
        # The issue was reported to NYC; until more is known, bills attached
        # to those committees are skipped.
        untyped_orgs = (
            'Charter Revision Commission 2019',
            'New York City Advisory Commission on Property Tax Reform',
            'Democratic Conference of the Council of the City of New York',
        )
        if matter['MatterBodyName'].strip() in untyped_orgs:
            return None

        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        intro_date = matter['MatterIntroDate']
        title = matter['MatterName']
        identifier = matter['MatterFile']

        # All three fields are required to build a bill.
        if not (intro_date and title and identifier):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]
        bill_session = self.sessions(self.toTime(intro_date))

        bill = Bill(
            identifier=identifier,
            title=title,
            classification=leg_type,
            legislative_session=bill_session,
            from_organization={"name": "New York City Council"},
        )

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        for source_url, source_note in ((legistar_web, 'web'),
                                        (legistar_api, 'api')):
            bill.add_source(source_url, note=source_note)

        long_title = matter['MatterTitle']
        if long_title:
            bill.add_title(long_title)

        abstract = matter['MatterEXText5']
        if abstract:
            bill.add_abstract(abstract, note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            # Record the page for later inspection and drop the bill.
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_document_link(attachment_name,
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_matter = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                # A related matter that cannot be fetched drops the bill.
                return None

            related_date = related_matter['MatterIntroDate']
            # NOTE(review): this calls self.session, whereas the bill's own
            # session above comes from self.sessions -- confirm both exist.
            related_session = self.session(self.toTime(related_date))
            related_identifier = related_matter['MatterFile']
            bill.add_related_bill(identifier=related_identifier,
                                  legislative_session=related_session,
                                  relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            # NUL characters are stripped from both text flavours.
            plain = text['MatterTextPlain']
            if plain:
                bill.extras['plain_text'] = plain.replace(u'\u0000', '')

            rtf = text['MatterTextRtf']
            if rtf:
                bill.extras['rtf_text'] = rtf.replace(u'\u0000', '')

        return bill
Пример #46
0
    def scrape_bill(self, bill_url, bill_id, session_id):
        """Parse a bill detail page into a Bill and return it.

        Args:
            bill_url: URL of the bill's detail page; also recorded as a source.
            bill_id: Identifier given to the Bill.
            session_id: Legislative session given to the Bill.

        Returns:
            The Bill, with abstract (when found), sponsor, actions,
            co-sponsors and committees attached.
        """
        doc = self.lxmlize(bill_url)

        # The first <em> text node on the page serves as the title.
        bill = Bill(identifier=bill_id,
                    legislative_session=session_id,
                    title=doc.xpath("//em/text()")[0])
        bill.add_source(bill_url, note="detail")

        # Abstract: the paragraph text preceding <h2>Legislative History</h2>.
        # A missing heading or paragraph is reported and otherwise ignored.
        try:
            history_heading = doc.xpath("//h2[text()='Legislative History']")[0]
            summary = history_heading.xpath("preceding-sibling::p/text()")[0]
            bill.add_abstract(abstract=summary.strip(), note="summary")
        except IndexError:
            print("No abstract for bill {} in session {}".format(
                bill_id, session_id))

        # Everything else lives in this vertical data table.
        table = doc.xpath("//table[@class='data vertical_table']")[0]

        # Primary sponsor.
        primary_name = table.xpath(
            self.bill_table_query("Sponsor") + "/text()")[0]
        bill.add_sponsorship(name=primary_name,
                             classification="Primary",
                             entity_type="person",
                             primary=True)

        # Actions: each text line may parse into several (date, type) pairs.
        # A line that fails to parse is reported and skipped.
        for line in table.xpath(self.bill_table_query("Actions") + "/text()"):
            try:
                for action_date, action_kind in self.parse_actions(line):
                    bill.add_action(date=action_date,
                                    description=action_kind,
                                    classification=action_kind)
            except ValueError:
                print("failed to parse these actions: {}".format([line]))

        # Co-sponsors: blank and whitespace-only entries are dropped.
        raw_names = table.xpath(
            self.bill_table_query("Co-Sponsors") + "/text()")
        cleaned_names = [
            stripped
            for stripped in [raw.strip() for raw in raw_names]
            if stripped
        ]
        for co_sponsor in cleaned_names:
            bill.add_sponsorship(name=co_sponsor,
                                 classification="co-sponsor",
                                 entity_type="person",
                                 primary=False)

        # Committees are stored as secondary organization sponsorships.
        committee_names = table.xpath(
            self.bill_table_query("Committee") + "/a/text()")
        for committee_name in committee_names:
            bill.add_sponsorship(
                name=committee_name,
                classification="secondary",  # classification ?
                entity_type="organization",
                primary=False)

        return bill
Пример #47
0
    def scrape_matter(self, matter_link, sess):
        """Scrape one matter page and yield the Bill built from it.

        Args:
            matter_link: URL of the matter detail page.
            sess: mapping whose ``"identifier"`` entry is used as the bill's
                legislative session.

        Yields:
            A single Bill. Nothing is yielded (after a warning) when the
            matter has neither a "File Name" nor a non-blank "Title".

        Raises:
            KeyError: if the page's "File Type" is not in the local
                classification table, or a required field is missing.
            ValueError: if "Introduced" is not in ``%m/%d/%Y`` form.
        """
        # Local file type -> bill classification. "other" means the Bill is
        # created without an explicit classification.
        matter_types = {
            "Additions": "other",
            "Administrative Order": "order",
            "Annual Evaluation": "other",
            "Bid Advertisement": "other",
            "Bid Awards": "other",
            "Bid Contract": "contract",
            "Bid Protest": "other",
            "Bid Rejection": "other",
            "Birthday Scroll": "commemoration",
            "Certificate of Appreciation": "commemoration",
            "Change Order": "order",
            "Citizen's Presentation": "other",
            "Commendation": "commemoration",
            "Conflict Waiver": "other",
            "Congratulatory Certificate": "commemoration",
            "Deferrals": "other",
            "Discussion Item": "other",
            "Distinguished Visitor": "other",
            "Joint Meeting/Workshop": "other",
            "Mayoral Veto": "other",
            "Miscellaneous": "other",
            "Nomination": "nomination",
            "Oath of Office": "other",
            "Omnibus Reserve": "bill",
            "Ordinance": "ordinance",
            "Plaque": "commemoration",
            "Presentation": "other",
            "Proclamation": "proclamation",
            "Professional Service Agreement": "contract",
            "Public Hearing": "other",
            "Report": "other",
            "Request for Proposals": "other",
            "Request for Qualifications": "other",
            "Request to Advertise": "other",
            "Resolution": "resolution",
            "Resolution of Sympathy": "resolution",
            "Service Awards": "commemoration",
            "Special Item": "other",
            "Special Presentation": "other",
            "Supplement": "other",
            "Swearing-In": "other",
            "Time Sensitive Items": "other",
            "Withdrawals": "other",
            "Workshop Item": "other",
            "Zoning": "other",
            "Zoning Resolution": "resolution",
        }
        matter_doc = self.lxmlize(matter_link)
        info_dict = self.matter_table_to_dict(matter_doc)

        # The parsed date is not used (the session comes from `sess`), but
        # the parse is kept so a missing or malformed "Introduced" value
        # still aborts the matter exactly as it did before.
        datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
        session = sess["identifier"]
        category = matter_types[info_dict["File Type"]]

        if 'File Name' in info_dict:
            title = info_dict["File Name"]
        elif "Title" in info_dict and info_dict["Title"].strip():
            title = info_dict["Title"].strip()
        else:
            self.warning("bill has no title")
            return

        bill_kwargs = {
            "identifier": info_dict["File Number"],
            "legislative_session": session,
            "title": title,
        }
        if category != 'other':
            bill_kwargs["classification"] = category
        bill = Bill(**bill_kwargs)

        for spons in info_dict["Sponsors"]:
            if spons == "NONE":
                continue
            # Entries look like "<name>,<sponsor type>"; without a comma the
            # whole entry is the name and the type defaults to "Sponsor".
            try:
                name, spons_type = spons.rsplit(",", 1)
            except ValueError:
                name = spons
                spons_type = "Sponsor"
            primary = "Prime Sponsor" in spons_type
            entity = "person"
            if "committee" in name:
                # Fix: this used to assign the undefined name `committee`,
                # which raised NameError; committees are organizations.
                # NOTE(review): the match is case-sensitive -- confirm
                # whether capitalised "Committee" should match as well.
                entity = "organization"
            bill.add_sponsorship(name, spons_type, entity, primary)

        if "Indexes" in info_dict:
            for subj in info_dict["Indexes"]:
                subject = subj.strip()
                if subject and subject != "NONE":
                    bill.add_subject(subject)

        if "Title" in info_dict and info_dict["Title"].strip():
            # The page's own "Note" wins over the generic label when present.
            note = "bill's long title'"
            if "Note" in info_dict and info_dict["Note"].strip():
                note = info_dict["Note"]
            bill.add_abstract(abstract=info_dict["Title"], note=note)

        self.process_action_table(matter_doc, bill)
        bill.add_source(matter_link, note='web')

        yield bill
Пример #48
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        """Yield bills of one measure type together with their vote events.

        Queries ``CABill`` rows whose ``session_year`` equals ``session`` and
        whose ``measure_type`` equals ``type_abbr``. For each row with a
        usable title the generator yields one ``VoteEvent`` per recorded vote
        (votes lacking a location or ending up with a blank motion are
        skipped) and then the ``Bill`` itself.

        Args:
            chamber: chamber the measures belong to; compared against
                ``'lower'`` / ``'upper'`` and used as the default actor.
            session: value matched against ``session_year`` and used as the
                bill's legislative session.
            bill_type: first classification given to every bill.
            type_abbr: measure type abbreviation to select.
            committee_abbr_regex: pattern object whose ``findall`` extracts
                committee abbreviations from action text. The default is
                built once, when this function is defined.

        Raises:
            KeyError: if an action names a committee abbreviation that
                ``committee_abbr_to_name`` cannot resolve.
            ScrapeError: if a vote location starts with neither an Assembly
                nor a Senate marker.
        """
        chamber_codes = {'Assembly': 'lower', 'Senate': 'upper'}

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            # NOTE(review): bill_session is computed but never used -- the
            # Bill below is created with `session`. Confirm which is intended
            # for special-session measures.
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower') or
                    (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id, source_url, media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest text (aka "summary") from the latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    # Re-insert the space after a closing parenthesis.
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            # Values assigned in this loop are overwritten by each later
            # version, so the last version with bill_xml decides the title,
            # classification, tags and impact clause used below.
            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime(
                    '%m/%d/%y')
                version_name = "{} - {}".format(
                    version_date_human, version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type='application/pdf',
                    date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            # No title also covers "no versions at all", so everything below
            # can rely on summary, impact_clause, tags and version being set.
            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            # `version` is the last version iterated above.
            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = chamber_codes[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    # Swap a leading chamber name for its code, keeping the
                    # rest of the actor string.
                    actor = re.sub(r'^(Assembly|Senate)',
                                   lambda m: chamber_codes[m.group()],
                                   actor)

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    # Fix: keep a real list. A bare filter() object is a
                    # one-shot iterator on Python 3; the length check below
                    # used to exhaust it, so neither the zip() loop nor the
                    # related-entity loop further down ever saw a committee.
                    committees = list(filter(None, committees))
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(committees) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                # Re-categorize: the committee substitution above may have
                # changed the action text.
                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                                           classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(
                        committee, entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Strip session, chamber and measure-number boilerplate from
                # the motion text so that only the motion itself remains.
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = (
                    'http://leginfo.legislature.ca.gov/faces'
                    '/billVotesClient.xhtml?bill_id={}'
                ).format(fsbill.identifier)
                fsvote.add_source(source_url)
                # Every vote on a bill shares one source URL, so the
                # enumeration index is what makes the id unique.
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Пример #49
0
    def scrape(self):
        """Yield a Bill, plus a VoteEvent per recorded vote, for each item.

        Walks ``self.legislation`` for items created after 2014-01-01. Each
        bill gets its source, alternate title, optional abstract and law
        number, sponsors, attachments and actions; its session is derived
        from the earliest action date, falling back to the first entry of
        ``SESSION_STARTS`` when there is no history. Vote events are yielded
        as they are found, before the bill they belong to.
        """
        cutoff = datetime.datetime(2014, 1, 1)

        for summary_row in self.legislation(created_after=cutoff):
            local_type = summary_row['Type']
            bill_class = BILL_TYPES[local_type]

            bill = Bill(identifier=summary_row['File\xa0#'],
                        title=summary_row['Title'],
                        legislative_session=None,
                        classification=bill_class,
                        from_organization={"name": "New York City Council"})
            detail_url = summary_row['url']
            bill.add_source(detail_url)

            details = self.legDetails(detail_url)
            history_rows = self.history(detail_url)

            bill.add_title(details['Name'],
                           note='created by administrative staff')

            if 'Summary' in details:
                bill.add_abstract(details['Summary'], note='')

            law_number = details['Law number']
            if law_number:
                bill.add_identifier(law_number, note='law number')

            for who, how, is_primary in self._sponsors(
                    details.get('Sponsors', [])):
                bill.add_sponsorship(who,
                                     how,
                                     'person',
                                     is_primary,
                                     entity_id=_make_pseudo_id(name=who))

            for doc in details.get('Attachments', []):
                bill.add_document_link(doc['label'],
                                       doc['url'],
                                       media_type="application/pdf")

            # Materialize the history: it is traversed twice below.
            history_rows = list(history_rows)

            if not history_rows:
                bill.legislative_session = str(self.SESSION_STARTS[0])
            else:
                first_action_time = min(
                    self.toTime(row['Date']) for row in history_rows)
                bill.legislative_session = self.sessions(first_action_time)

            for row in history_rows:
                description = row['Action']
                if not description:
                    continue

                action_class = ACTION_CLASSIFICATION[description]
                when = self.toDate(row['Date'])

                actor = row['Action\xa0By']
                if actor == 'Town Hall Meeting':
                    # Town hall entries are not recorded as bill actions.
                    continue
                if actor == 'City Council':
                    actor = 'New York City Council'
                elif actor == 'Administration':
                    actor = 'Mayor'

                recorded = bill.add_action(
                    description,
                    when,
                    organization={'name': actor},
                    classification=action_class)

                details_cell = row['Action\xa0Details']
                if 'url' not in details_cell:
                    continue
                details_url = details_cell['url']

                if action_class == 'committee-referral':
                    # The committee is whatever follows the last " to the "
                    # in the action text.
                    referral = self.actionDetails(details_url)
                    committee = referral['Action text'].rsplit(
                        ' to the ', 1)[-1]
                    recorded.add_related_entity(
                        committee,
                        'organization',
                        entity_id=_make_pseudo_id(name=committee))

                result, votes = self.extractVotes(details_url)
                if not votes:
                    continue

                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=description,
                    organization={'name': actor},
                    classification=action_class,
                    start_date=when,
                    result=result,
                    bill=bill)
                vote_event.add_source(details_url)

                for option, voter in votes:
                    vote_event.vote(option, voter)

                yield vote_event

            full_text = self.text(detail_url)

            extras = {'local_classification': local_type}
            if full_text:
                extras['full_text'] = full_text
            bill.extras = extras

            yield bill
Пример #50
0
    def get_bill(self, matter):
        """Build a Bill from a Legistar matter record.

        Returns:
            The populated Bill, or None when the matter is skipped: its body
            is one of the untyped organizations, its id is listed in
            ``DUPLICATED_ACTIONS``, it lacks an intro date, name or file
            number, sponsorship or text lookups raise KeyError, it carries
            the known duplicate attachment, or a related matter cannot be
            fetched.
        """
        # NYC Legistar currently has no conventional "Types" for three newly
        # added committees (https://legistar.council.nyc.gov/Departments.aspx).
        # The issue was reported to NYC; until more is known, bills attached
        # to those committees are skipped.
        untyped_bodies = (
            'Charter Revision Commission 2019',
            'New York City Advisory Commission on Property Tax Reform',
            'Democratic Conference of the Council of the City of New York',
        )
        if matter['MatterBodyName'].strip() in untyped_bodies:
            return None

        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        intro_date = matter['MatterIntroDate']
        name = matter['MatterName']
        file_number = matter['MatterFile']
        if not (intro_date and name and file_number):
            return None

        classification = BILL_TYPES[matter['MatterTypeName']]
        session = self.sessions(self.toTime(intro_date))

        bill = Bill(identifier=file_number,
                    title=name,
                    classification=classification,
                    legislative_session=session,
                    from_organization={"name": "New York City Council"})

        web_url = matter['legistar_url']
        api_url = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        alternate_title = matter['MatterTitle']
        if alternate_title:
            bill.add_title(alternate_title)

        summary = matter['MatterEXText5']
        if summary:
            bill.add_abstract(summary, note='')

        # A KeyError here is logged against the web URL and drops the bill.
        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(web_url)
            return None

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            label = attachment['MatterAttachmentName']
            if label:
                bill.add_document_link(label,
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related = self.endpoint('/matters/{0}',
                                        relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None

            # NOTE(review): this calls self.session while the bill's own
            # session above comes from self.sessions -- confirm both exist.
            related_session = self.session(
                self.toTime(related['MatterIntroDate']))
            bill.add_related_bill(identifier=related['MatterFile'],
                                  legislative_session=related_session,
                                  relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(web_url)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            # NUL characters are stripped from both text renditions.
            for source_key, extras_key in (('MatterTextPlain', 'plain_text'),
                                           ('MatterTextRtf', 'rtf_text')):
                body = text[source_key]
                if body:
                    bill.extras[extras_key] = body.replace(u'\u0000', '')

        return bill