def scrape_bill(self, row, chamber, session):
    """Build a Delaware Bill from one API result `row`, enriched with
    sponsor/version/fiscal-note data scraped from the HTML detail page.

    Yields VoteEvents for the bill, then the Bill itself. Amendments
    (ids containing 'SA' or 'HA') are skipped entirely.
    """
    bill_id = row['LegislationNumber']

    # TODO: re-evaluate if these should be separate bills
    if 'SA' in bill_id or 'HA' in bill_id:
        self.warning('skipping amendment %s', bill_id)
        return

    bill_type = self.classify_bill(bill_id)
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')

    html = self.lxmlize(html_url)

    # Additional sponsors are recorded as 'primary' like the lead sponsor.
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # BUG FIX: this xpath previously matched the "Additional Sponsor(s):"
    # label a second time (despite the comment saying "Co-Sponsor(s):"),
    # so co-sponsors were never collected. Query the correct label.
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)
    yield bill
def scrape_chamber(self, chamber, session):
    """Scrape all Kansas bills for one chamber from the KS bill_status API.

    Filters the full bill list down to ids starting with the chamber's
    letter (H/S), then yields whatever scrape_html produces plus the Bill.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]

    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json['content']

    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # Classify from the bill number; 'CR' must be checked before 'R'
        # because 'R' is a substring of 'CR'.
        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # BUG FIX: previously an unmatched bill number left `btype`
            # unbound on the first iteration (UnboundLocalError) or silently
            # reused the previous bill's classification on later ones.
            self.warning('unrecognized bill type for %s, defaulting to bill', bill_id)
            btype = 'bill'

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}
        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        if (bill_data['LONGTITLE'] and bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        # An "original sponsor" is the API's expression of "primary sponsor"
        for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
            bill.add_sponsorship(
                name=primary_sponsor,
                entity_type='organization' if "committee" in primary_sponsor.lower() else 'person',
                primary=True,
                classification="original sponsor")
        for sponsor in bill_data['SPONSOR_NAMES']:
            # skip names already recorded as original sponsors
            if sponsor in bill_data['ORIGINAL_SPONSOR']:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type='organization' if "committee" in sponsor.lower() else 'person',
                primary=False,
                classification='cosponsor',
            )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = ('upper' if event['chamber'] == 'Senate' else 'lower')
            date = event['session_date']
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']
            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'], event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(action, date, chamber=actor, classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)
        yield bill
def scrape_bill(self, row, session):
    """Build a Delaware Bill from one API result `row`.

    The display code may encode an amendment ("HB 1 w/ HA 2") and/or a
    substitute ("HS 1 for HB 1"); both are stripped off the id and kept
    in bill.extras. Yields VoteEvents, then the Bill.

    Raises ValueError when the display code contains extra spaces but
    matches neither the " w/ " nor the " for " pattern.
    """
    bill_id = row["LegislationDisplayCode"]

    amendment = None
    substitute = None

    # A plain id like "HB 12" has exactly one space; more than one space
    # means an amendment/substitute decoration is present.
    if bill_id.count(" ") > 1:
        if " w/ " in bill_id:
            self.info("Found amended bill `{}`".format(bill_id))
            bill_id, amendment = bill_id.split(" w/ ")
        # A bill can _both_ be amended and be substituted
        if " for " in bill_id:
            self.info("Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(" for ")
        if amendment is None and substitute is None:
            raise ValueError("unknown bill_id format: " + bill_id)

    bill_type = self.classify_bill(bill_id)
    # Senate ids start with "S"; everything else is treated as House.
    chamber = "upper" if bill_id.startswith("S") else "lower"
    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        chamber=chamber,
        title=row["LongTitle"],
        classification=bill_type,
    )
    if row["Synopsis"]:
        bill.add_abstract(row["Synopsis"], "synopsis")
    if row["ShortTitle"]:
        bill.add_title(row["ShortTitle"], "short title")
    if row["SponsorPersonId"]:
        self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"], "primary")
    if substitute:
        bill.extras["substitute"] = substitute
    if amendment:
        bill.extras["amendment"] = amendment

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
        row["LegislationId"]
    )
    bill.add_source(html_url, note="text/html")

    html = self.lxmlize(html_url)

    # Additional sponsors are recorded as 'primary', like the lead sponsor.
    additional_sponsors = html.xpath(
        '//label[text()="Additional Sponsor(s):"]' "/following-sibling::div/a/@href"
    )
    for sponsor_url in additional_sponsors:
        # strip the detail-page URL prefix to recover the numeric person id
        sponsor_id = sponsor_url.replace(
            "https://legis.delaware.gov/LegislatorDetail?" "personId=", ""
        )
        self.add_sponsor_by_legislator_id(bill, sponsor_id, "primary")

    cosponsors = html.xpath(
        '//label[text()="Co-Sponsor(s):"]/' "following-sibling::div/a/@href"
    )
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace(
            "https://legis.delaware.gov/LegislatorDetail?" "personId=", ""
        )
        self.add_sponsor_by_legislator_id(bill, sponsor_id, "cosponsor")

    versions = html.xpath(
        '//label[text()="Original Text:"]/following-sibling::div/a/@href'
    )
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = "Bill Text"
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row["LegislationId"])
    # `is True` guards against truthy non-boolean API values
    if row["HasAmendments"] is True:
        self.scrape_amendments(bill, row["LegislationId"])
    yield from self.scrape_votes(bill, row["LegislationId"], session)
    yield bill
def scrape_bill(self, bill_num, session):
    """Scrape one Wyoming bill from the LSO JSON API.

    Yields a VoteEvent per roll call (via scrape_vote), then the Bill.
    """
    chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}

    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
        '{}?calendarDate='.format(
            session, bill_num)
    response = self.get(bill_json_url)
    bill_json = json.loads(response.content.decode('utf-8'))

    # NOTE(review): if bill_json['bill'] is an id string like 'HB0001',
    # its first character is always truthy and this would always pick
    # 'lower' -- confirm the API's 'bill' field shape.
    chamber = 'lower' if bill_json['bill'][0] else 'upper'

    bill = Bill(identifier=bill_json['bill'],
                legislative_session=session,
                title=bill_json['catchTitle'],
                chamber=chamber,
                classification="bill",
                )

    bill.add_title(bill_json['billTitle'])

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, bill_json['bill'])
    bill.add_source(source_url)

    for action_json in bill_json['billActions']:
        utc_action_date = self.parse_local_date(action_json['statusDate'])

        # map the API's location to an OCD chamber; unknown locations
        # leave the actor as None
        actor = None
        if action_json['location'] and action_json['location'] in chamber_map:
            actor = chamber_map[action_json['location']]

        action = bill.add_action(
            chamber=actor,
            description=action_json['statusMessage'],
            date=utc_action_date,
            classification=categorize_action(action_json['statusMessage']),
        )

        action.extras = {
            'billInformationID': action_json['billInformationID']}

    if bill_json['introduced']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

        bill.add_version_link(note="Introduced",
                              url=url,
                              media_type="application/pdf"  # optional but useful!
                              )

    if bill_json['enrolledAct']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

        bill.add_version_link(note="Enrolled",
                              url=url,
                              media_type="application/pdf"  # optional but useful!
                              )

    if bill_json['fiscalNote']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

        bill.add_document_link(note="Fiscal Note",
                               url=url,
                               media_type="application/pdf"  # optional but useful!
                               )

    if bill_json['digest']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

        bill.add_document_link(note="Bill Digest",
                               url=url,
                               media_type="application/pdf"  # optional but useful!
                               )

    if bill_json['vetoes']:
        for veto in bill_json['vetoes']:
            url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
            bill.add_version_link(note=veto['vetoLinkText'],
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

    for amendment in bill_json['amendments']:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
            session, amendment['amendmentNumber'])

        if amendment['sponsor'] and amendment['status']:
            title = 'Amendment {} ({}) - {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
                amendment['sponsor'],
                amendment['status'],
            )
        else:
            title = 'Amendment {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
            )

        # add versions of the bill text
        version = bill.add_version_link(
            note=title,
            url=url,
            media_type="application/pdf",
        )
        version['extras'] = {
            'amendmentNumber': amendment['amendmentNumber'],
            'sponsor': amendment['sponsor'],
        }

    for sponsor in bill_json['sponsors']:
        status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
        # a sponsorTitle marks an individual legislator; none means committee
        sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
        bill.add_sponsorship(
            name=sponsor['name'],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor['primarySponsor']
        )

    if bill_json['summary']:
        bill.add_abstract(
            note="summary",
            abstract=bill_json['summary'],
        )

    if bill_json['enrolledNumber']:
        bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

    if bill_json['chapter']:
        bill.extras['chapter'] = bill_json['chapter']

    if bill_json['effectiveDate']:
        # normalize the API's m/d/Y date to ISO for extras
        eff = datetime.datetime.strptime(
            bill_json['effectiveDate'], '%m/%d/%Y')
        bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

    bill.extras['wy_bill_id'] = bill_json['id']

    for vote_json in bill_json['rollCalls']:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def scrape_bill(self, bill_num, session):
    """Scrape one Wyoming bill from the LSO JSON API.

    Yields a VoteEvent per roll call (via scrape_vote), then the Bill.
    """
    chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}

    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = ("http://wyoleg.gov/LsoService/api/BillInformation/{}/"
                     "{}?calendarDate=".format(session, bill_num))
    bill_json = json.loads(self.get(bill_json_url).content.decode("utf-8"))

    chamber = "lower" if bill_json["bill"][0] else "upper"

    bill = Bill(
        identifier=bill_json["bill"],
        legislative_session=session,
        title=bill_json["catchTitle"],
        chamber=chamber,
        classification="bill",
    )
    bill.add_title(bill_json["billTitle"])
    bill.add_source(
        "http://lso.wyoleg.gov/Legislation/{}/{}".format(
            session, bill_json["bill"]))

    for act in bill_json["billActions"]:
        # map the API location to an OCD chamber; unknown -> None
        where = None
        if act["location"]:
            where = chamber_map.get(act["location"])
        recorded = bill.add_action(
            chamber=where,
            description=act["statusMessage"],
            date=self.parse_local_date(act["statusDate"]),
            classification=categorize_action(act["statusMessage"]),
        )
        recorded.extras = {"billInformationID": act["billInformationID"]}

    # The four simple PDF attachments differ only in JSON key, display
    # note, and whether they count as a version or a document.
    for key, note, attach in (
        ("introduced", "Introduced", bill.add_version_link),
        ("enrolledAct", "Enrolled", bill.add_version_link),
        ("fiscalNote", "Fiscal Note", bill.add_document_link),
        ("digest", "Bill Digest", bill.add_document_link),
    ):
        if bill_json[key]:
            attach(
                note=note,
                url="http://wyoleg.gov/{}".format(bill_json[key]),
                media_type="application/pdf",  # optional but useful!
            )

    if bill_json["vetoes"]:
        for veto in bill_json["vetoes"]:
            bill.add_version_link(
                note=veto["vetoLinkText"],
                url="http://wyoleg.gov/{}".format(veto["vetoLinkPath"]),
                media_type="application/pdf",  # optional but useful!
            )

    for amdt in bill_json["amendments"]:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        amdt_url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
            session, amdt["amendmentNumber"])
        if amdt["sponsor"] and amdt["status"]:
            amdt_title = "Amendment {} ({}) - {} ({})".format(
                amdt["amendmentNumber"],
                amdt["order"],
                amdt["sponsor"],
                amdt["status"],
            )
        else:
            amdt_title = "Amendment {} ({})".format(
                amdt["amendmentNumber"], amdt["order"])
        # add versions of the bill text
        link = bill.add_version_link(
            note=amdt_title, url=amdt_url, media_type="application/pdf")
        link["extras"] = {
            "amendmentNumber": amdt["amendmentNumber"],
            "sponsor": amdt["sponsor"],
        }

    for sponsor in bill_json["sponsors"]:
        # a sponsorTitle marks an individual legislator; none means committee
        bill.add_sponsorship(
            name=sponsor["name"],
            classification="primary" if sponsor["primarySponsor"] else "cosponsor",
            entity_type="person" if sponsor["sponsorTitle"] else "organization",
            primary=sponsor["primarySponsor"],
        )

    if bill_json["summary"]:
        bill.add_abstract(note="summary", abstract=bill_json["summary"])
    if bill_json["enrolledNumber"]:
        bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]
    if bill_json["chapter"]:
        bill.extras["chapter"] = bill_json["chapter"]
    if bill_json["effectiveDate"]:
        # normalize the API's m/d/Y date to ISO for extras
        effective = datetime.datetime.strptime(bill_json["effectiveDate"],
                                               "%m/%d/%Y")
        bill.extras["effective_date"] = effective.strftime("%Y-%m-%d")
    bill.extras["wy_bill_id"] = bill_json["id"]

    for vote_json in bill_json["rollCalls"]:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape every CA bill of one measure type (e.g. AB/SB) for a session
    from the mirrored CA legislative database, yielding VoteEvents and
    Bills.

    NOTE: the `committee_abbr_regex` default is evaluated once at import
    time; this shares one compiled regex across calls.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        # Sanity check: bill id prefix must agree with the chamber we are
        # scraping; mismatches are skipped, not raised.
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                # re-insert the space that the XML flattening dropped
                # after closing parens
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)
        # NOTE(review): `summary` (and leaked `version`, used again below)
        # is only bound when bill.versions is non-empty; the unconditional
        # reads below assume every bill row has at least one version --
        # confirm against the data source.

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime(
                '%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"

            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            # boolean-ish flags arrive as 'Yes'/'No' strings
            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # uses `version` left over from the loop above, i.e. the last
        # version with bill_xml
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                committees = filter(None, committees)

                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                # NOTE(review): in Python 3 `committees` is a lazy filter
                # iterator; the list() in this assert exhausts it, so the
                # zip below iterates zero pairs and kwargs['committees']
                # is left consumed. Looks like a py2->py3 porting bug --
                # confirm and materialize the list once if so.
                assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            # de-duplicate identical (actor, text, date) actions
            if (actor, act_str, date) in seen_actions:
                continue

            # re-categorize using the committee-expanded action string
            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'),
                                       chamber=actor,
                                       classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            # strip leading/trailing bill-number noise from the motion text
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ', '',
                            motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                            motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$', '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = (
                'http://leginfo.legislature.ca.gov/faces'
                '/billVotesClient.xhtml?bill_id={}'
            ).format(fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill
        self.session.expire_all()
def scrape(self, session=None, chambers=None):
    """Scrape Ohio bills for `session` (latest if None).

    Sessions before the 128th have no data; sessions before the 131st go
    through old_scrape. Otherwise bills come from the HTML listing plus
    the undocumented solarapi JSON API. Yields VoteEvents and Bills.
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info("no session, using %s", session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))
    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)
    else:
        chamber_dict = {
            "Senate": "upper",
            "House": "lower",
            "House of Representatives": "lower",
            "house": "lower",
            "senate": "upper",
        }
        # so presumanbly not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {
            "approved": True,
            "passed": True,
            "adopted": True,
            "true": True,
            "false": False,
            "failed": False,
            True: True,
            False: False,
        }
        # maps the API's action codes to OCD action classifications;
        # None means "known code, no classification"
        action_dict = {
            "ref_ctte_100": "referral-committee",
            "intro_100": "introduction",
            "intro_101": "introduction",
            "pass_300": "passage",
            "intro_110": "reading-1",
            "refer_210": "referral-committee",
            "crpt_301": None,
            "crpt_317": None,
            "concur_606": "passage",
            "pass_301": "passage",
            "refer_220": "referral-committee",
            "intro_102": ["introduction", "passage"],
            "intro_105": ["introduction", "passage"],
            "intro_ref_ctte_100": "referral-committee",
            "refer_209": None,
            "intro_108": ["introduction", "passage"],
            "intro_103": ["introduction", "passage"],
            "msg_reso_503": "passage",
            "intro_107": ["introduction", "passage"],
            "imm_consid_360": "passage",
            "refer_213": None,
            "adopt_reso_100": "passage",
            "adopt_reso_110": "passage",
            "msg_507": "amendment-passage",
            "confer_713": None,
            "concur_603": None,
            "confer_712": None,
            "msg_506": "amendment-failure",
            "receive_message_100": "passage",
            "motion_920": None,
            "concur_611": None,
            "confer_735": None,
            "third_429": None,
            "final_501": None,
            "concur_608": None,
        }

        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(
            session=session)
        legislators = self.get_legislator_ids(first_page)
        # documents are fetched in bulk up front, then matched to bills
        all_amendments = self.get_other_data_source(
            first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url,
                                                 "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url,
                                                  "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url,
                                                  "analysiss")

        for row in self.get_bill_rows(session):
            (
                spacer,
                number_link,
                _ga,
                title,
                primary_sponsor,
                status,
                spacer,
            ) = row.xpath("td")

            # S.R.No.1 -> SR1
            bill_id = number_link.text_content().replace("No.", "")
            bill_id = bill_id.replace(".", "").replace(" ", "")
            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

            title = title.text_content().strip()
            title = re.sub(r"^Title", "", title)

            chamber = "lower" if "H" in bill_id else "upper"
            classification = "bill" if "B" in bill_id else "resolution"

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=classification,
            )
            bill.add_source(number_link.xpath("a/@href")[0])

            # get bill from API
            bill_api_url = (
                "http://search-prod.lis.state.oh.us/solarapi/v1/"
                "general_assembly_{}/{}/{}/".format(
                    session,
                    "bills" if "B" in bill_id else "resolutions",
                    bill_id.lower().replace(" ", ""),
                ))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data["items"][0]["longtitle"]
            bill.add_title(data["items"][0]["longtitle"], "long title")

            # this stuff is version-specific
            for version in data["items"]:
                version_name = version["version"]
                version_link = base_url + version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type="application/pdf")

            # we'll use latest bill_version for everything else
            bill_version = data["items"][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

            # actions: missing action page means the bill simply has none
            try:
                action_doc = self.get(base_url +
                                      bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning(
                            "Unknown action {desc} with code {code}."
                            " Add it to the action_dict"
                            ".".format(desc=action_desc,
                                       code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(
                        datetime.datetime.strptime(action["datetime"],
                                                   "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc, date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill,
                              base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill,
                              base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill,
                              base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill,
                              base_url)

            # votes; an unloadable page skips the whole bill
            vote_url = base_url + bill_version["votes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning(
                    "Vote page not loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            # committee votes follow the same shape
            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning(
                    "Vote page not loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(
                    data["items"][0]["effective_date"], "%Y-%m-%d")
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action list JSON.
                # It looks like:
                # Effective 7/6/18
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(
                    effective_action,
                    effective_date,
                    chamber="executive",
                    classification=["became-law"],
                )

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url + bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url + bill_version["disapprove"][0][
                    "link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError(
                        "Whoa, a disapprove! We've never"
                        " gotten one before."
                        " Go write some code to deal "
                        "with it: {}".format(disapprove_url))

            yield bill
def test_full_bill():
    """End-to-end importer check: build a fully-populated scraped bill,
    import it (with its orgs, sponsor person, and a prior-session bill),
    then verify every field round-trips through the database."""
    create_jurisdiction()
    sponsor_person = ScrapePerson('Adam Smith')
    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                                   parent_id=house._id)

    # an earlier-session bill for the related-bill link to resolve against
    prior_bill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                            classification='tax bill', from_organization=house._id)

    scraped = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=house._id)
    scraped.subject = ['taxes', 'axes']
    scraped.add_identifier('SB 9')
    scraped.add_title('Tack & Axe Tax Act')
    scraped.add_action('introduced in house', '1900-04-01', chamber='lower')
    committee_act = scraped.add_action('sent to arbitrary committee', '1900-04-04',
                                       chamber='lower')
    committee_act.add_related_entity('arbitrary committee', 'organization', committee._id)
    scraped.add_related_bill("HB 99", legislative_session="1899",
                             relation_type="prior-session")
    scraped.add_sponsorship('Adam Smith', classification='extra sponsor',
                            entity_type='person', primary=False,
                            entity_id=sponsor_person._id)
    scraped.add_sponsorship('Jane Smith', classification='lead sponsor',
                            entity_type='person', primary=True)
    scraped.add_abstract('This is an act about axes and taxes and tacks.',
                         note="official", date='1969-10-20')
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                              media_type='application/pdf')
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                              media_type='text/html')
    scraped.add_version_link('Fiscal Note', 'http://example.com/v/1',
                             media_type='text/html')
    scraped.add_source('http://example.com/source')

    # run the import pipeline: orgs and person first, then both bills
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), committee.as_dict()])
    person_importer = PersonImporter('jid')
    person_importer.import_data([sponsor_person.as_dict()])
    BillImporter('jid', org_importer, person_importer).import_data(
        [prior_bill.as_dict(), scraped.as_dict()])

    # fetch the imported bill and verify core fields
    imported = Bill.objects.get(identifier='HB 1')
    assert imported.from_organization.classification == 'lower'
    assert imported.identifier == scraped.identifier
    assert imported.title == scraped.title
    assert imported.classification == scraped.classification
    assert imported.subject == ['taxes', 'axes']
    assert imported.abstracts.get().note == 'official'
    assert imported.abstracts.get().date == '1969-10-20'

    # alternate title / identifier were stored
    assert imported.other_titles.get().title == 'Tack & Axe Tax Act'
    assert imported.other_identifiers.get().identifier == 'SB 9'

    # actions: order preserved (if this breaks it'll be intermittent)
    actions = list(imported.actions.all())
    assert len(actions) == 2
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related bill was added and resolved to the imported HB 99
    related = imported.related_bills.get()
    assert related.identifier == 'HB 99'
    assert related.related_bill.identifier == 'HB 99'

    # sponsorships: the non-primary one resolved to a Person, the primary did not
    sponsorships = imported.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for sponsorship in sponsorships:
        if sponsorship.primary:
            assert sponsorship.person is None
            assert sponsorship.organization is None
        else:
            assert sponsorship.person == person

    # versions & documents kept their links
    versions = imported.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = imported.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert imported.sources.count() == 1
def _parse_house_bill(self, url, session):
    """Scrape a single MO House bill summary page and yield a Bill.

    Uses the print-friendly page, which is simpler and *drastically*
    smaller (8k rather than 100k), then reads sponsor, documents,
    versions, summaries, and amendments from it.
    """
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self._house_base_url, url)

    # the URL is an iframed version now, so swap in for the actual bill page
    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new
    url = url.replace('Bill.aspx', 'BillContent.aspx')
    url = url.replace('&code=R', '&code=R&style=new')

    bill_page = lxml.html.fromstring(self.get(url).text)
    bill_page.make_links_absolute(url)

    id_nodes = bill_page.xpath('//*[@class="entry-title"]/div')
    if not id_nodes:
        # blank summary page: remember the URL and bail out
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = clean_text(id_nodes[0].text_content())

    bill_desc = clean_text(
        bill_page.xpath('//*[@class="BillDescription"]')[0].text_content())

    table_rows = bill_page.xpath('//table/tr')
    # if there is a cosponsor all the rows are pushed down one for the
    # extra cosponsor row
    cosponsor_offset = 1 if table_rows[2][0].text_content().strip() == 'Co-Sponsor:' else 0

    lr_label_tag = table_rows[3 + cosponsor_offset]
    assert lr_label_tag[0].text_content().strip() == 'LR Number:'

    last_action_offset = 0
    if table_rows[4 + cosponsor_offset][0].text_content().strip() == 'Governor Action:':
        last_action_offset = 1

    official_title_tag = table_rows[5 + cosponsor_offset + last_action_offset]
    assert official_title_tag[0].text_content().strip() == 'Bill String:'
    official_title = official_title_tag[1].text_content()

    # could substitute the description for the name,
    # but keeping it separate for now.
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = 'Appropriations Bill'
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title
            ))
            return

    bill = Bill(
        bill_id,
        chamber='lower',
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note='official')
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors live on their own page
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # first BillDocuments block: supporting documents
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc_name = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (
            self._house_base_url,
            doc_tag[0].attrib['href']
        )
        bill.add_document_link(doc_name, text_url, media_type='text/html')

    # second BillDocuments block: bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            mimetype = 'application/pdf' if vurl.text == 'PDF' else 'text/html'
            bill.add_version_link(version, vurl.attrib['href'],
                                  media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill versions: everything between the row containing
    # "Bill Text" and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            mimetype = 'application/pdf' if '.pdf' in path else 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill summaries: everything between the row containing
    # "Bill Summary" and the next div.DocHeaderRow
    summary_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Summary")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    # if there are no amedments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = 'Bill Summary ({})'.format(version)
            mimetype = 'application/pdf' if '.pdf' in path else 'text/html'
            bill.add_document_link(summary_name, path, media_type=mimetype,
                                   on_duplicate='ignore')

    # house bill amendments, flagged Defeated/Adopted/Distributed by icon
    amendment_rows = bill_page.xpath(
        '//div[contains(text(),"Amendment")]/'
        'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = 'Amendment {}'.format(version)
        if row.xpath('.//img[contains(@title,"Defeated")]'):
            summary_name = '{} (Defeated)'.format(summary_name)
        if row.xpath('.//img[contains(@title,"Adopted")]'):
            summary_name = '{} (Adopted)'.format(summary_name)
        if row.xpath('.//img[contains(@title,"Distributed")]'):
            summary_name = '{} (Distributed)'.format(summary_name)
        mimetype = 'application/pdf' if '.pdf' in path else 'text/html'
        bill.add_version_link(summary_name, path, media_type=mimetype,
                              on_duplicate='ignore')

    yield bill
def _parse_senate_billpage(self, bill_url, year):
    """Scrape one MO Senate bill detail page and yield a Bill.

    Reads id/title/description labels, the primary sponsor, then follows
    links for cosponsors, actions, versions, and adopted amendments.
    (``year`` is currently unused; the session comes from ``self._session_id``.)
    """
    bill_page = self.lxmlize(bill_url)

    # labelled fields on the page
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()

    # bill type from the id prefix, defaulting to plain "bill"
    prefix = bill_id[:3]
    bill_type = bill_types[prefix] if prefix in bill_types else "bill"

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)
    if bid == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # primary sponsor
    sponsor_node = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    bill.add_sponsorship(
        sponsor_node.text_content(),
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_tag and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # actions are on a separate page as well
    action_links = bill_page.xpath('//a[@id="hlAllActions"]')
    if action_links:
        self._parse_senate_actions(bill, action_links[0].attrib['href'])

    # full-text versions, stored on a separate page
    versions_link = bill_page.xpath('//a[@id="hlFullBillText"]')
    if versions_link and versions_link[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, versions_link[0].attrib['href'])

    # only adopted amendments get recorded as versions
    for link in bill_page.xpath('//a[contains(@href,"ShowAmendment.asp")]'):
        link_text = link.xpath('string(.)').strip()
        if 'adopted' in link_text.lower():
            bill.add_version_link(link_text, link.xpath('@href')[0],
                                  media_type='application/pdf',
                                  on_duplicate='ignore')

    yield bill
def _scrape_bills(self):
    """
    Does the following

    1) Scrapes bill data from unitedstates project and saves the data to
       the path specified in the UnitedStates module
    2) Iterates over bill data and converts each one to an OCD-compliant
       bill model.
    3) Yields the OCD-compliant bill model instance

    @return: generator for federal US bills in OCD-compliant format
    @rtype: generator
    """
    # run scraper first to pull in all the bill data
    self._run_unitedstates_bill_scraper()

    # iterate over all the files and build and yield Bill objects
    # NOTE: raw strings -- the original patterns contained the invalid
    # escape sequence '\/' in plain strings.
    for filename in find_files(settings.SCRAPED_DATA_DIR,
                               r'.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'):
        try:
            with open(filename) as json_file:
                json_data = json.load(json_file)

            # Initialize Object
            bill = Bill(constants.TYPE_MAP[json_data['bill_type']]['canonical'] +
                        ' ' + json_data['number'],
                        json_data['congress'],
                        json_data['official_title'],
                        chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber']
                        )

            # add source of data
            bill.add_source(json_data['url'], note='all')

            # add subjects
            for subject in json_data['subjects']:
                bill.add_subject(subject)

            # add summary
            if 'summary' in json_data and json_data['summary'] is not None:
                bill.add_abstract(json_data['summary']['text'],
                                  json_data['summary']['as'],
                                  json_data['summary']['date'])

            # add titles
            for item in json_data['titles']:
                bill.add_title(item['title'], item['type'])

            # add other/related Bills
            for b in json_data['related_bills']:
                if 'type' in b and b['type'] == 'bill':
                    split = b['bill_id'].split('-')
                    m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0])
                    bill.add_related_bill(
                        constants.TYPE_MAP[m.group(1)]['canonical'] + ' ' + m.group(2),
                        legislative_session=split[1],
                        relation_type='companion')

            # add sponsor
            bill.add_sponsorship_by_identifier(
                json_data['sponsor']['name'], 'person', 'person', True,
                scheme='thomas_id',
                identifier=json_data['sponsor']['thomas_id'],
                chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])

            # add cosponsors
            for cs in json_data['cosponsors']:
                bill.add_sponsorship_by_identifier(
                    cs['name'], 'person', 'person', False,
                    scheme='thomas_id', identifier=cs['thomas_id'],
                    chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])

            # add introduced_at and actions
            bill.add_action('date of introduction',
                            datetime_to_date(json_data['introduced_at']),
                            chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'],
                            related_entities=[])

            # add other actions (appended raw, bypassing add_action validation)
            for action in json_data['actions']:
                bill.actions.append({'date': datetime_to_date(action['acted_at']),
                                     'type': [action['type']],
                                     'description': action['text'],
                                     'actor': constants.TYPE_MAP[json_data['bill_type']]['chamber'],
                                     'related_entities': []
                                     })

            # add bill versions
            for version_path in find_files(
                    os.path.join(settings.SCRAPED_DATA_DIR, 'data',
                                 bill.legislative_session, 'bills',
                                 json_data['bill_type'],
                                 json_data['bill_type'] + json_data['number'],
                                 'text-versions'),
                    r'/.*/*\.json'):
                try:
                    with open(version_path) as version_file:
                        version_json_data = json.load(version_file)
                        for k, v in version_json_data['urls'].items():
                            bill.versions.append(
                                {'date': datetime_to_date(version_json_data['issued_on']),
                                 'type': version_json_data['version_code'],
                                 'name': constants.VERSION_MAP[version_json_data['version_code']],
                                 'links': [{'mimetype': k, 'url': v}]})
                except IOError:
                    print("Unable to open or parse file with path " + version_path)
                    continue

            # finally yield bill object
            yield bill
        except IOError:
            print("Unable to open file with path " + filename)
            print(traceback.format_exc())
            continue
        except KeyError:
            print("Unable to parse file with path " + filename)
            print(traceback.format_exc())
            continue
        # FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior
        # while letting process-control exceptions propagate.
        except Exception:
            print('Unknown error with ' + filename)
            print(traceback.format_exc())
            continue
def test_full_bill():
    """Variant of the full-bill import test where the sponsor Person already
    exists in the database instead of being imported with the bill."""
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')
    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                                   parent_id=house._id)

    prior_bill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                            classification='tax bill', from_organization=house._id)

    scraped = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=house._id)
    scraped.subject = ['taxes', 'axes']
    scraped.add_identifier('SB 9')
    scraped.add_title('Tack & Axe Tax Act')
    scraped.add_action('introduced in house', '1900-04-01', chamber='lower')
    committee_act = scraped.add_action('sent to arbitrary committee', '1900-04-04',
                                       chamber='lower')
    committee_act.add_related_entity('arbitrary committee', 'organization', committee._id)
    scraped.add_related_bill("HB 99", legislative_session="1899",
                             relation_type="prior-session")
    scraped.add_sponsorship('Adam Smith', classification='extra sponsor',
                            entity_type='person', primary=False,
                            entity_id=person.id)
    scraped.add_sponsorship('Jane Smith', classification='lead sponsor',
                            entity_type='person', primary=True)
    scraped.add_abstract('This is an act about axes and taxes and tacks.',
                         note="official")
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                              media_type='application/pdf')
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                              media_type='text/html')
    scraped.add_version_link('Fiscal Note', 'http://example.com/v/1',
                             media_type='text/html')
    scraped.add_source('http://example.com/source')

    # import bill
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), committee.as_dict()])
    person_importer = PersonImporter('jid')
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circut the json id lookup.
    person_importer.json_to_db_id['person-id'] = 'person-id'
    BillImporter('jid', org_importer, person_importer).import_data(
        [prior_bill.as_dict(), scraped.as_dict()])

    # fetch the imported bill and verify core fields
    imported = Bill.objects.get(identifier='HB 1')
    assert imported.from_organization.classification == 'lower'
    assert imported.identifier == scraped.identifier
    assert imported.title == scraped.title
    assert imported.classification == scraped.classification
    assert imported.subject == ['taxes', 'axes']
    assert imported.abstracts.get().note == 'official'

    # alternate title / identifier were stored
    assert imported.other_titles.get().title == 'Tack & Axe Tax Act'
    assert imported.other_identifiers.get().identifier == 'SB 9'

    # actions: order preserved (if this breaks it'll be intermittent)
    actions = list(imported.actions.all())
    assert len(actions) == 2
    assert actions[0].organization == Organization.objects.get(classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related bill was added and resolved to the imported HB 99
    related = imported.related_bills.get()
    assert related.identifier == 'HB 99'
    assert related.related_bill.identifier == 'HB 99'

    # sponsorships: non-primary resolved to the pre-created Person
    sponsorships = imported.sponsorships.all()
    assert len(sponsorships) == 2
    for sponsorship in sponsorships:
        if sponsorship.primary:
            assert sponsorship.person is None
            assert sponsorship.organization is None
        else:
            assert sponsorship.person == person

    # versions & documents kept their links
    versions = imported.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = imported.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert imported.sources.count() == 1
def _parse_senate_billpage(self, bill_url, year): bill_page = self.lxmlize(bill_url) # get all the info needed to record the bill # TODO probably still needs to be fixed bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content() bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content() bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content() # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content() bill_type = "bill" triplet = bill_id[:3] if triplet in bill_types: bill_type = bill_types[triplet] subs = [] bid = bill_id.replace(" ", "") if bid in self._subjects: subs = self._subjects[bid] self.info("With subjects for this bill") self.info(bid) bill = Bill( bill_id, title=bill_desc, legislative_session=year, classification=bill_type, ) bill.subject = subs bill.add_abstract(bill_desc, note='abstract') bill.add_source(bill_url) if bill_title: bill.add_title(bill_title) # Get the primary sponsor sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0] bill_sponsor = sponsor.text_content() # bill_sponsor_link = sponsor.attrib.get('href') bill.add_sponsorship( bill_sponsor, entity_type='person', classification='primary', primary=True, ) # cosponsors show up on their own page, if they exist cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]') if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'): self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href']) # get the actions action_url = bill_page.xpath('//a[@id="hlAllActions"]') if len(action_url) > 0: action_url = action_url[0].attrib['href'] self._parse_senate_actions(bill, action_url) # stored on a separate page versions_url = bill_page.xpath('//a[@id="hlFullBillText"]') if len(versions_url) > 0 and versions_url[0].attrib.get('href'): self._parse_senate_bill_versions(bill, versions_url[0].attrib['href']) yield bill
def scrape_bill(self, row, session):
    """Build and yield a Delaware Bill from one API row, scraping
    sponsors, versions, fiscal notes, actions, amendments, and votes
    from the bill's HTML detail page.
    """
    bill_id = row['LegislationDisplayCode']

    # ids can carry amendment ("HB 1 w/ HA 2") and/or substitute
    # ("SS 1 for SB 5") markers; peel them off before classifying
    amendment = None
    substitute = None
    if bill_id.count(' ') > 1:
        if ' w/ ' in bill_id:
            self.info('Found amended bill `{}`'.format(bill_id))
            bill_id, amendment = bill_id.split(' w/ ')
        # A bill can _both_ be amended and be substituted
        if ' for ' in bill_id:
            self.info("Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(' for ')
        if amendment is None and substitute is None:
            raise ValueError('unknown bill_id format: ' + bill_id)

    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber='upper' if bill_id.startswith('S') else 'lower',
                title=row['LongTitle'],
                classification=self.classify_bill(bill_id))

    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')
    if substitute:
        bill.extras['substitute'] = substitute
    if amendment:
        bill.extras['amendment'] = amendment

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')
    html = self.lxmlize(html_url)

    # additional sponsors count as primary; the legislator id is the
    # tail of each LegislatorDetail link
    for sponsor_url in html.xpath('//label[text()="Additional Sponsor(s):"]'
                                  '/following-sibling::div/a/@href'):
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    for sponsor_url in html.xpath('//label[text()="Co-Sponsor(s):"]/'
                                  'following-sibling::div/a/@href'):
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    # original text links become versions, all titled 'Bill Text'
    for version_url in html.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href'):
        bill.add_version_link('Bill Text', version_url,
                              media_type=self.mime_from_link(version_url))

    for fiscal in html.xpath('//div[contains(@class,"fiscalNote")]/a/@href'):
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])

    if row['HasAmendments'] is True:
        self.scrape_amendments(bill, row['LegislationId'])

    yield from self.scrape_votes(bill, row['LegislationId'], session)
    yield bill
def scrape(self, session=None):
    """Scrape DC Council legislation for one council period.

    Pages through the LIMS AdvancedSearch endpoint, fetches each bill's
    detail JSON, and yields pupa Bill objects with sponsors, actions,
    documents, and votes attached.

    Fixes vs. previous version:
    - introducer sponsorships used ``classification=cosponsors`` (a list)
      instead of the string 'primary'
    - cosponsors were added via the nonexistent legacy ``bill.add_sponsor``
    - the "No Introducer" log indexed the pupa Bill like a dict (TypeError)
    - additional_information used item assignment on Bill; now bill.extras
    - committee markup attachments tested membership on a *list* and
      referenced undefined ``is_version``/stale ``vote``
    - ``self.save_bill`` (billy-era API) replaced with ``yield bill``,
      consistent with the other scrapers in this file
    """
    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # get member id matching for vote parsing
    member_ids = self.get_member_ids()[session]
    per_page = 10  # seems like it gives me 10 no matter what.
    start_record = 0

    headers = {"Content-Type": "application/json"}
    url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
    bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
    params = {
        "request": {
            "sEcho": 2,
            "iColumns": 4,
            "sColumns": "",
            "iDisplayStart": 0,
            "iDisplayLength": per_page,
            "mDataProp_0": "ShortTitle",
            "mDataProp_1": "Title",
            "mDataProp_2": "LegislationCategories",
            "mDataProp_3": "Modified",
            "iSortCol_0": 0,
            "sSortDir_0": "asc",
            "iSortingCols": 0,
            "bSortable_0": "true",
            "bSortable_1": "true",
            "bSortable_2": "true",
            "bSortable_3": "true"
        },
        "criteria": {
            "Keyword": "",
            "Category": "",
            "SubCategoryId": "",
            "RequestOf": "",
            "CouncilPeriod": str(session),
            "Introducer": "",
            "CoSponsor": "",
            "CommitteeReferral": "",
            "CommitteeReferralComments": "",
            "StartDate": "",
            "EndDate": "",
            "QueryLimit": 100,
            "FilterType": "",
            "Phases": "",
            "LegislationStatus": "0",
            "IncludeDocumentSearch": "false"
        }
    }
    param_json = json.dumps(params)
    response = self.post(url, headers=headers, data=param_json)
    # the response is a terrible string-of-nested-json-strings. Yuck.
    response = decode_json(response.json()["d"])
    data = response["aaData"]

    # NOTE(review): bill_versions is a module-level global, presumably read
    # by the add_documents helper -- confirm before localizing it.
    global bill_versions

    while len(data) > 0:
        for bill_listing in data:
            # sometimes they're in there more than once, so we'll keep track
            bill_versions = []

            bill_id = bill_listing["Title"]
            if bill_id.startswith("AG"):
                # actually an agenda, skip
                continue
            bill_params = {"legislationId": bill_id}
            bill_info = self.post(bill_url, headers=headers,
                                  data=json.dumps(bill_params))
            bill_info = decode_json(bill_info.json()["d"])["data"]
            bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

            legislation_info = bill_info["Legislation"][0]
            title = legislation_info["ShortTitle"]

            if bill_id.startswith("R") or bill_id.startswith("CER"):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            # dc has no chambers, so no chamber is set on the Bill
            bill = Bill(bill_id,
                        legislative_session=session,
                        title=title,
                        classification=bill_type)

            # sponsors and cosponsors
            if "Introducer" in legislation_info:
                introducers = legislation_info["Introducer"]
                intro_date = self.date_format(
                    legislation_info["IntroductionDate"])
                bill.add_action("Introduced",
                                intro_date,
                                chamber="upper",
                                classification="introduction")
            else:
                # sometimes there are introducers, sometimes not.
                # Set Introducers to empty array to avoid downstream
                # breakage, but log bills without introducers
                # (FIX: previously indexed the Bill object like a dict)
                self.logger.warning(
                    "No Introducer: {0} {1}".format(session, bill_id))
                introducers = []
            try:
                # sometimes there are cosponsors, sometimes not.
                cosponsors = legislation_info["CoSponsor"]
            except KeyError:
                cosponsors = []

            for introducer in introducers:
                name = introducer["Name"]
                # they messed up Phil Mendelson's name
                if name == "Phil Pmendelson":
                    name = "Phil Mendelson"
                # FIX: classification was mistakenly set to the cosponsors list
                bill.add_sponsorship(name,
                                     classification="primary",
                                     entity_type='person',
                                     primary=True)
            for cosponsor in cosponsors:
                name = cosponsor["Name"]
                if name == "Phil Pmendelson":
                    name = "Phil Mendelson"
                # FIX: was the legacy billy API bill.add_sponsor(...)
                bill.add_sponsorship(name,
                                     classification="cosponsor",
                                     entity_type='person',
                                     primary=False)

            # if it's become law, add the law number as an alternate title
            if "LawNumber" in legislation_info:
                law_num = legislation_info["LawNumber"]
                if law_num:
                    bill.add_title(law_num)

            # also sometimes it's got an act number
            if "ActNumber" in legislation_info:
                act_num = legislation_info["ActNumber"]
                if act_num:
                    bill.add_title(act_num)

            # sometimes AdditionalInformation has a previous bill name
            if "AdditionalInformation" in legislation_info:
                add_info = legislation_info["AdditionalInformation"]
                if "previously" in add_info.lower():
                    prev_title = add_info.lower().replace(
                        "previously", "").strip().replace(" ", "")
                    bill.add_title(prev_title.upper())
                elif add_info:
                    # FIX: Bill doesn't support item assignment; use extras
                    bill.extras["additional_information"] = add_info

            if "WithDrawnDate" in legislation_info:
                withdrawn_date = self.date_format(
                    legislation_info["WithDrawnDate"])
                withdrawn_by = legislation_info["WithdrawnBy"][0][
                    "Name"].strip()
                if withdrawn_by == "the Mayor":
                    bill.add_action("withdrawn",
                                    withdrawn_date,
                                    chamber="executive",
                                    classification="withdrawal")
                elif "committee" in withdrawn_by.lower():
                    bill.add_action("withdrawn",
                                    withdrawn_date,
                                    chamber="upper",
                                    classification="withdrawal",
                                    committees=withdrawn_by)
                else:
                    bill.add_action("withdrawn",
                                    withdrawn_date,
                                    chamber="upper",
                                    classification="withdrawal",
                                    legislators=withdrawn_by)

            # deal with actions involving the mayor
            # (in dc, mayor == governor because openstates schema)
            mayor = bill_info["MayorReview"]
            if mayor != []:
                mayor = mayor[0]
                if "TransmittedDate" in mayor:
                    transmitted_date = self.date_format(
                        mayor["TransmittedDate"])
                    bill.add_action("transmitted to mayor",
                                    transmitted_date,
                                    chamber="executive",
                                    classification="executive-receipt")
                if 'SignedDate' in mayor:
                    signed_date = self.date_format(mayor["SignedDate"])
                    bill.add_action("signed",
                                    signed_date,
                                    chamber="executive",
                                    classification="executive-signature")
                elif 'ReturnedDate' in mayor:
                    # if returned but not signed, it was vetoed
                    veto_date = self.date_format(mayor["ReturnedDate"])
                    bill.add_action("vetoed",
                                    veto_date,
                                    chamber="executive",
                                    classification="executive-veto")
                    if 'EnactedDate' in mayor:
                        # if it was returned and enacted but not signed,
                        # there was a veto override
                        override_date = self.date_format(mayor["EnactedDate"])
                        bill.add_action("veto override",
                                        override_date,
                                        chamber="upper",
                                        classification="veto-override-passage")
                if 'AttachmentPath' in mayor:
                    # documents relating to the mayor's review
                    self.add_documents(mayor["AttachmentPath"], bill)

            congress = bill_info["CongressReview"]
            if len(congress) > 0:
                congress = congress[0]
                if "TransmittedDate" in congress:
                    transmitted_date = self.date_format(
                        congress["TransmittedDate"])
                    bill.add_action("Transmitted to Congress for review",
                                    transmitted_date,
                                    chamber="other")

            # deal with committee actions
            if "DateRead" in legislation_info:
                date = legislation_info["DateRead"]
            elif "IntroductionDate" in legislation_info:
                date = legislation_info["IntroductionDate"]
            else:
                self.logger.warning(
                    "Can't find anything that looks like an action date."
                    " Skipping")
                continue
            date = self.date_format(date)

            if "CommitteeReferral" in legislation_info:
                committees = []
                for committee in legislation_info["CommitteeReferral"]:
                    if committee["Name"].lower() == "retained by the council":
                        committees = []
                        break
                    else:
                        committees.append(committee["Name"])
                if committees != []:
                    bill.add_action("referred to committee",
                                    date,
                                    chamber="upper",
                                    committees=committees,
                                    classification="referral-committee")

            if "CommitteeReferralComments" in legislation_info:
                committees = [committee["Name"] for committee in
                              legislation_info["CommitteeReferralComments"]]
                bill.add_action("comments from committee",
                                date,
                                chamber="upper",
                                committees=committees,
                                classification="other")

            # deal with random docs floating around
            for doc in bill_info["OtherDocuments"]:
                if "AttachmentPath" in doc:
                    self.add_documents(doc["AttachmentPath"], bill)
                else:
                    self.logger.warning(
                        "Document path missing from 'Other Documents'")

            if "MemoLink" in legislation_info:
                self.add_documents(legislation_info["MemoLink"], bill)

            if "AttachmentPath" in legislation_info:
                self.add_documents(legislation_info["AttachmentPath"], bill)

            # full council votes
            for vote in bill_info["VotingSummary"]:
                self.process_vote(vote, bill, member_ids)

            # deal with committee votes
            # FIX: attachment check now runs per committee_action instead of
            # testing membership on the list (and no undefined is_version)
            if "CommitteeMarkup" in bill_info:
                for committee_action in bill_info["CommitteeMarkup"]:
                    self.process_committee_vote(committee_action, bill)
                    if "AttachmentPath" in committee_action:
                        self.add_documents(
                            committee_action["AttachmentPath"], bill)

            bill.add_source(bill_source_url)
            yield bill

        # get next page
        start_record += per_page
        params["request"]["iDisplayStart"] = start_record
        param_json = json.dumps(params)
        response = self.post(url, headers=headers, data=param_json)
        response = decode_json(response.json()["d"])
        data = response["aaData"]
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.

    chamber: 'upper' or 'lower'; also the initial actor for actions.
    session: session string; its first four characters are assumed to be
        the calendar year used to complete action dates (TODO confirm for
        sessions spanning a year boundary).
    bill_id: e.g. 'H 123'; spaces are stripped when building URLs/keys.
    short_title: optional; recorded as an alternate title when it differs
        from the scraped title.

    Yields VoteEvent objects (via parse_vote) and finally the Bill.
    """
    url = BILL_URL % (session, bill_id.replace(' ', ''))
    bill_page = self.get(url, verify=False).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    # bill_tables[1] holds the title, [0] the sponsor line, [2] the action rows
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(legislative_session=session,
                chamber=chamber,
                identifier=bill_id,
                title=title,
                classification=bill_type)
    bill.add_source(url)
    for subject in self._subjects[bill_id.replace(' ', '')]:
        bill.add_subject(subject)

    if short_title and title.lower() != short_title.lower():
        bill.add_title(short_title, 'short title')

    # documents
    doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get('href')
        # engrossments and bill text are versions; everything else is a document
        if 'Engrossment' in name or 'Bill Text' in name:
            bill.add_version_link(note=name, url=href, media_type="application/pdf")
        else:
            bill.add_document_link(note=name, url=href, media_type="application/pdf")

    def _split(string):
        # split a run of sponsor names on commas / AND separators
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split('by')
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if 'COMMITTEE' in sponsors.upper():
                bill.add_sponsorship(name=sponsors.strip(),
                                     entity_type="organization",
                                     primary=True,
                                     classification='primary')
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(classification='primary',
                                             name=person,
                                             entity_type="person",
                                             primary=True)

    # Walk the action table as a state machine: `actor` carries forward
    # across rows until an action line changes it, and `last_date` fills
    # rows that omit a date.
    actor = chamber
    last_date = None
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]
        if date:
            last_date = date
        else:
            # NOTE(review): if the first data row has no date, last_date is
            # still None and strptime below raises TypeError — confirm the
            # site always dates the first row.
            date = last_date
        date = datetime.datetime.strptime(date + '/' + session[0:4],
                                          "%m/%d/%Y").strftime('%Y-%m-%d')
        if action.startswith('House'):
            actor = 'lower'
        elif action.startswith('Senate'):
            actor = 'upper'

        # votes
        if 'AYES' in action or 'NAYS' in action:
            yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url)
            # bill.add_vote_event(vote)
        # some td's text is seperated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace(u'\xa0', ' ').strip()
        atype = get_action(actor, action)
        bill.add_action(action, date, chamber=actor, classification=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if 'to House' in action:
            actor = 'lower'
        elif 'to Senate' in action:
            actor = 'upper'
    yield bill
def scrape_bill(self, row, session):
    """Build a Bill from one row of Delaware's legislation API.

    Supplements the API payload with additional sponsors, co-sponsors,
    version links and fiscal notes scraped from the bill's HTML detail
    page, then yields any votes followed by the bill itself.
    """
    bill_id = row['LegislationDisplayCode']
    amendment = None
    substitute = None

    # Display codes such as "HB 1 w/ HA 2" or "HS 1 for HB 1" embed
    # amendment / substitute markers that must be split off the base id.
    if bill_id.count(' ') > 1:
        if ' w/ ' in bill_id:
            self.info('Found amended bill `{}`'.format(bill_id))
            bill_id, amendment = bill_id.split(' w/ ')
        # A bill can _both_ be amended and be substituted
        if ' for ' in bill_id:
            self.info("Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(' for ')
        if amendment is None and substitute is None:
            raise ValueError('unknown bill_id format: ' + bill_id)

    chamber = 'upper' if bill_id.startswith('S') else 'lower'
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=self.classify_bill(bill_id))

    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')
    if substitute:
        bill.extras['substitute'] = substitute
    if amendment:
        bill.extras['amendment'] = amendment

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId'])
    bill.add_source(html_url, note='text/html')
    page = self.lxmlize(html_url)

    # Sponsors beyond the primary one only appear on the HTML page; the
    # legislator id is the tail of each LegislatorDetail link.
    detail_prefix = 'https://legis.delaware.gov/LegislatorDetail?personId='
    sponsor_sections = (
        ('//label[text()="Additional Sponsor(s):"]/following-sibling::div/a/@href',
         'primary'),
        ('//label[text()="Co-Sponsor(s):"]/following-sibling::div/a/@href',
         'cosponsor'),
    )
    for xpath, sponsor_type in sponsor_sections:
        for href in page.xpath(xpath):
            self.add_sponsor_by_legislator_id(bill,
                                              href.replace(detail_prefix, ''),
                                              sponsor_type)

    for version_url in page.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href'):
        bill.add_version_link('Bill Text', version_url,
                              media_type=self.mime_from_link(version_url))

    for fiscal_url in page.xpath('//div[contains(@class,"fiscalNote")]/a/@href'):
        self.scrape_fiscal_note(bill, fiscal_url)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)

    yield bill
def scrape_chamber(self, chamber, session):
    """Scrape every bill for one chamber from the Kansas bill_status API.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier

    Yields VoteEvents (via scrape_html) followed by each Bill.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json['content']
    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # BUGFIX: btype was previously left unbound here, so an
            # unrecognized bill number raised NameError (or silently
            # reused the previous iteration's type). Skip it explicitly.
            self.warning('unrecognized bill type for %s', bill_id)
            continue

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}
        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        # LONGTITLE is an alternate title when SHORTTITLE was used above
        if (bill_data['LONGTITLE'] and bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        for sponsor in bill_data['SPONSOR_NAMES']:
            # a lone sponsor is primary; members of a group are cosponsors
            stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                     else 'cosponsor')
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    primary=stype == 'primary',
                    classification=stype,
                )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = ('upper' if event['chamber'] == 'Senate' else 'lower')
            date = datetime.datetime.strptime(event['occurred_datetime'],
                                              "%Y-%m-%dT%H:%M:%S")
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']

            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'], event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(action, date.strftime('%Y-%m-%d'),
                            chamber=actor, classification=atype)

        try:
            yield from self.scrape_html(bill, session)
        except scrapelib.HTTPError:
            # BUGFIX: Bill objects are not subscriptable — bill['bill_id']
            # raised TypeError here. Use the local bill_id instead.
            self.warning('unable to fetch HTML for bill {0}'.format(bill_id))

        yield bill
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape all California bills of one measure type for a chamber/session.

    Queries the mirrored CA database (SQLAlchemy, via self.session) rather
    than the website, then reconstructs titles, versions, sponsors, actions
    and votes. Yields VoteEvents and Bills.

    NOTE: the committee_abbr_regex default is evaluated once at class
    definition time (standard mutable-default caveat does not apply to the
    compiled regex, but be aware it is shared across calls).
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))
    # pre-2009 sessions are "archived" and get vote data from HTML instead
    archive_year = int(session[0:4])
    not_archive_year = archive_year >= 2009
    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        # known-bad record in the 2005-2006 data
        if bill_id.strip() == "SB77" and session == "20052006":
            continue
        fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
        # sanity check: an S-prefixed id in the lower chamber (or A in upper)
        # indicates corrupt source data
        if (bill_id.startswith("S") and chamber == "lower") or (
                bill_id.startswith("A") and chamber == "upper"):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        summary = ""

        # Get digest test (aka "summary") from latest version.
        if bill.versions and not_archive_year:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                # re-insert the space CA drops after a closing paren
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue
            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)
            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)
            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)
            if title:
                all_titles.add(title)
            type_ = [bill_type]
            if version.appropriation == "Yes":
                type_.append("appropriation")
            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")
            if version.subject:
                subject = clean_title(version.subject)
        # NOTE(review): tags / impact_clause / version are only bound inside
        # the loop above; bills with no usable versions are filtered out by
        # the empty-title check below, which keeps the later references safe.
        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue
        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags
        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)
        # `version` here is the last version iterated above
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}
        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            # map plain chamber names (and floor variants) to OCD chambers
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {
                            "Assembly": "lower",
                            "Senate": "upper"
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)
            type_ = []
            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        self.warning(msg % args)

                # NOTE(review): filter() returns a one-shot iterator; the
                # list() call in the assert below exhausts it, after which
                # both the zip() below and kwargs["committees"] yield
                # nothing. Likely a latent bug — confirm intended behavior
                # before relying on committee data downstream.
                committees = filter(None, committees)
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}
                if not_archive_year:
                    assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith("."):
                        act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            # skip exact duplicate actions
            if (actor, act_str, date) in seen_actions:
                continue

            # re-categorize now that act_str may have been rewritten above
            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))

        source_url = (
            "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
        )
        source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

        # Votes for non archived years
        if archive_year > 2009:
            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue
                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    # raise ScrapeError("Bad location: %s" % full_loc)
                    # To uncomment
                    continue

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                # strip session/bill-number boilerplate off the motion text
                motion = motion.strip()
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ",
                                "", motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$",
                                "", motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                #     'name': vote_location,
                #     'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}
                fsvote.add_source(source_url)
                # disambiguate multiple votes on the same page
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

        # Archived (pre-2010) vote data must be scraped from the HTML page
        if len(bill.votes) > 0 and archive_year <= 2009:
            vote_page_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            vote_page_url += (
                f"bill_id={session}{bill.session_num}{fsbill.identifier}")

            # parse the bill data page, finding the latest html text
            data = self.get(vote_page_url).content
            doc = html.fromstring(data)
            doc.make_links_absolute(vote_page_url)

            num_of_votes = len(doc.xpath("//div[@class='status']"))
            for vote_section in range(1, num_of_votes + 1):
                lines = doc.xpath(
                    f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                )
                date, result, motion, vtype, location = "", "", "", "", ""
                votes = {}
                for line in lines:
                    line = line.text_content().split()
                    if line[0] == "Date":
                        date = line[1]
                        date = datetime.datetime.strptime(date, "%m/%d/%y")
                        date = self._tz.localize(date)
                    elif line[0] == "Result":
                        result = "pass" if "PASS" in line[1] else "fail"
                    elif line[0] == "Motion":
                        motion = " ".join(line[1:])
                    elif line[0] == "Location":
                        location = " ".join(line[1:])
                    elif len(line) > 1:
                        if line[0] == "Ayes" and line[1] != "Count":
                            votes["yes"] = line[1:]
                        elif line[0] == "Noes" and line[1] != "Count":
                            votes["no"] = line[1:]
                        elif line[0] == "NVR" and line[1] != "Count":
                            votes["not voting"] = line[1:]

                # Determine chamber based on location
                first_part = location.split(" ")[0].lower()
                vote_chamber = ""
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                if len(motion) > 0:
                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=date,
                        result=result,
                        classification=vtype,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.add_source(vote_page_url)
                    fsvote.pupa_id = vote_page_url + "#" + str(
                        vote_section)

                    for how_voted, voters in votes.items():
                        for voter in voters:
                            voter = voter.replace(",", "")
                            fsvote.vote(how_voted, voter)
                    yield fsvote

        yield fsbill
    self.session.expire_all()
def get_bill(self, matter):
    '''Make Bill object from given matter.'''

    '''
    Currently, NYC Legistar does not have conventional "Types" for
    three newly added committees:
    https://legistar.council.nyc.gov/Departments.aspx
    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.
    '''
    orgs_without_type = [
        'Charter Revision Commission 2019',
        'New York City Advisory Commission on Property Tax Reform',
        'Democratic Conference of the Council of the City of New York'
    ]
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    # a matter missing any of these can't be modeled as a Bill
    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]

    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    # sponsorship data can reference people missing from the roster;
    # treat the whole bill as unscrapeable in that case
    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        # NOTE(review): this discards the entire bill (not just the
        # attachment) when the known-duplicate attachment id appears —
        # confirm that is intended rather than `continue`.
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None

        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            # related bill doesn't exist in the API; skip this bill
            return None
        else:
            date = related_bill['MatterIntroDate']
            # NOTE(review): the intro-date lookup above uses
            # self.sessions(...) (plural); confirm self.session here is
            # not a typo for it.
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # strip NUL characters, which the database layer rejects
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                u'\u0000', '')

        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                u'\u0000', '')

    return bill
def scrape_bill(self, bill_id):
    """Convert one legacy OpenStates API bill into pupa Bill/VoteEvent objects.

    Fetches the old API payload, consumes it field-by-field with pop(), and
    asserts at the end that every key was handled. Yields VoteEvents, then
    the Bill.
    """
    old = self.api('bills/' + bill_id + '?')

    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+bill_type', None)
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])

    classification = old.pop('type')

    # ca weirdness
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')

    # these types are not migrated at all
    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    # state-specific renames of legacy classifications
    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']

    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        # normalize legacy actor strings to OCD chambers
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (actor.lower().startswith('lower (')
                                          and self.state == 'ca'):
            actor = 'lower'
        # NOTE(review): 'upper`' below contains a literal backtick — it can
        # never match a .lower()'d actor; confirm it isn't a typo for 'upper'.
        elif actor.lower() in ('senate', 'upper`') or (
                actor.lower().startswith('upper (') and self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst', 'Became Law w',
                       'conference') or (actor.lower().startswith('legislature (')
                                         and self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
            actor = 'legislature'

        if act['action']:
            newact = new.add_action(act['action'], act['date'][:10], chamber=actor,
                                    classification=[action_types[c]
                                                    for c in act['type']
                                                    if c != 'other'])
            # NOTE(review): the loop variable `re` shadows the `re` module
            # for the rest of this scope — rename if regexes are ever needed
            # below this point.
            for re in act.get('related_entities', []):
                if re['type'] == 'committee':
                    re['type'] = 'organization'
                elif re['type'] == 'legislator':
                    re['type'] = 'person'
                newact.add_related_entity(re['name'], re['type'])

    for comp in old.pop('companions', []):
        # only these states have reliable companion data
        if self.state in ('nj', 'ny', 'mn'):
            rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

    for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids',
                                                            []):
        new.add_identifier(abid)

    # generic OpenStates stuff
    for id in old.pop('all_ids'):
        new.add_identifier(id, scheme='openstates')

    for source in old.pop('sources'):
        source.pop('retrieved', None)
        new.add_source(**source)

    ext_title = old.pop('+extended_title', None)
    if ext_title:
        new.add_title(ext_title, note='Extended Title')
    official_title = old.pop('+official_title', None)
    if official_title:
        new.add_title(official_title, note='Official Title')

    # grab-bag of per-state fields that survive only in extras
    to_extras = ['+status', '+final_disposition', '+volume_chapter',
                 '+ld_number', '+referral', '+companion', '+description',
                 '+fiscal_note_probable:', '+preintroduction_required:',
                 '+drafter', '+category:', '+chapter', '+requester',
                 '+transmittal_date:', '+by_request_of', '+bill_draft_number:',
                 '+bill_lr', '+bill_url', '+rcs_num', '+fiscal_note',
                 '+impact_clause', '+fiscal_notes', '+short_title', '+type_',
                 '+conference_committee', 'conference_committee',
                 '+companion_bill_ids', '+additional_information']
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # votes
    vote_no = 1
    for vote in old.pop('votes'):
        # discard fields with no pupa equivalent
        vote.pop('id')
        vote.pop('state')
        vote.pop('bill_id')
        vote.pop('bill_chamber', None)
        vote.pop('+state', None)
        vote.pop('+country', None)
        vote.pop('+level', None)
        vote.pop('+vacant', None)
        vote.pop('+not_voting', None)
        vote.pop('+amended', None)
        vote.pop('+excused', None)
        vote.pop('+NV', None)
        vote.pop('+AB', None)
        vote.pop('+P', None)
        vote.pop('+V', None)
        vote.pop('+E', None)
        vote.pop('+EXC', None)
        vote.pop('+EMER', None)
        vote.pop('+present', None)
        vote.pop('+absent', None)
        vote.pop('+seconded', None)
        vote.pop('+moved', None)
        vote.pop('+vote_type', None)
        vote.pop('+actual_vote', None)
        vote.pop('+skip_votes', None)
        vote.pop('vote_id')
        vote.pop('+bill_chamber', None)
        vote.pop('+session', None)
        vote.pop('+bill_id', None)
        vote.pop('+bill_session', None)
        vote.pop('committee', None)
        vote.pop('committee_id', None)

        # map the legacy vote type to a pupa classification list
        vtype = vote.pop('type', 'passage')
        if vtype == 'veto_override':
            vtype = ['veto-override']
        elif vtype == 'amendment':
            vtype = ['amendment-passage']
        elif vtype == 'other':
            vtype = ''
        else:
            vtype = ['bill-passage']

        # most states need identifiers for uniqueness, just do it everywhere
        identifier = vote['date'] + '-' + str(vote_no)
        vote_no += 1

        chamber = vote.pop('chamber')
        if chamber == 'upper' and self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber == 'joint':
            chamber = 'legislature'

        newvote = VoteEvent(legislative_session=vote.pop('session'),
                            motion_text=vote.pop('motion'),
                            result='pass' if vote.pop('passed') else 'fail',
                            chamber=chamber,
                            start_date=vote.pop('date'),
                            classification=vtype,
                            bill=new,
                            identifier=identifier)
        for vt in ('yes', 'no', 'other'):
            newvote.set_count(vt, vote.pop(vt + '_count'))
            for name in vote.pop(vt + '_votes'):
                newvote.vote(vt, name['name'])

        for source in vote.pop('sources'):
            source.pop('retrieved', None)
            newvote.add_source(**source)

        # fall back to the bill's sources when a vote has none
        if not newvote.sources:
            newvote.sources = new.sources

        to_extras = ['+record', '+method', 'method', '+filename', 'record',
                     '+action', '+location', '+rcs_num', '+type_',
                     '+threshold', '+other_vote_detail', '+voice_vote']
        for k in to_extras:
            v = vote.pop(k, None)
            if v:
                newvote.extras[k.replace('+', '')] = v

        # every vote key must have been consumed above
        assert not vote, vote.keys()

        yield newvote

    # every bill key must have been consumed above
    assert not old, old.keys()

    yield new
def scrape_chamber(self, chamber, session):
    """Scrape every bill for one chamber from the Kansas bill_status API.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier

    Yields VoteEvents (via scrape_html) followed by each Bill.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json['content']
    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # BUGFIX: btype was previously left unbound here, so an
            # unrecognized bill number raised NameError (or silently
            # reused the previous iteration's type). Skip it explicitly.
            self.warning('unrecognized bill type for %s', bill_id)
            continue

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}
        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        # LONGTITLE is an alternate title when SHORTTITLE was used above
        if (bill_data['LONGTITLE'] and bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        for sponsor in bill_data['SPONSOR_NAMES']:
            # a lone sponsor is primary; members of a group are cosponsors
            stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                     else 'cosponsor')
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    primary=stype == 'primary',
                    classification=stype,
                )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = ('upper' if event['chamber'] == 'Senate' else 'lower')
            date = datetime.datetime.strptime(event['occurred_datetime'],
                                              "%Y-%m-%dT%H:%M:%S")
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']

            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'], event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(action, date.strftime('%Y-%m-%d'),
                            chamber=actor, classification=atype)

        try:
            yield from self.scrape_html(bill, session)
        except scrapelib.HTTPError:
            # BUGFIX: Bill objects are not subscriptable — bill['bill_id']
            # raised TypeError here. Use the local bill_id instead.
            self.warning('unable to fetch HTML for bill {0}'.format(bill_id))

        yield bill
def scrape(self):
    """Scrape NYC Council legislation (2014-01-01 onward), yielding Bill
    and VoteEvent objects built from Legistar summary/detail pages.

    The legislative_session is derived from the earliest action date of
    each bill (falling back to SESSION_STARTS[0] when there is no history).
    """
    for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)) :
        leg_type = BILL_TYPES[leg_summary['Type']]

        # NOTE: 'File\xa0#' / 'Action\xa0By' keys use a non-breaking space,
        # matching the column headers as Legistar renders them.
        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,  # filled in below from history
                    classification=leg_type,
                    from_organization={"name":"New York City Council"})
        bill.add_source(leg_summary['url'])

        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details :
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number'] :
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])) :
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary,
                                 entity_id = make_pseudo_id(name=sponsor))

        for attachment in leg_details.get('Attachments', []) :
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # history is a generator; materialize so we can both take a min
        # and iterate it again below
        history = list(history)

        if history :
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else :
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history :
            action_description = action['Action']
            if not action_description :
                continue
            action_class = ACTION_CLASSIFICATION[action_description]

            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']
            # normalize Legistar org names to canonical jurisdiction names
            if responsible_org == 'City Council' :
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration' :
                responsible_org = 'Mayor'

            if responsible_org == 'Town Hall Meeting' :
                continue
            else :
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

            if 'url' in action['Action\xa0Details'] :
                action_detail_url = action['Action\xa0Details']['url']
                if action_class == 'committee-referral' :
                    # pull the receiving committee's name out of the
                    # action text ("... to the <committee>")
                    action_details = self.actionDetails(action_detail_url)
                    referred_committee = \
                        action_details['Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(referred_committee,
                                           'organization',
                                           entity_id = make_pseudo_id(name=referred_committee))
                result, votes = self.extractVotes(action_detail_url)
                if votes :
                    action_vote = VoteEvent(legislative_session=bill.legislative_session,
                                            motion_text=action_description,
                                            organization={'name': responsible_org},
                                            classification=action_class,
                                            start_date=action_date,
                                            result=result,
                                            bill=bill)
                    action_vote.add_source(action_detail_url)
                    for option, voter in votes :
                        action_vote.vote(option, voter)

                    yield action_vote

        text = self.text(leg_summary['url'])

        if text :
            bill.extras = {'local_classification' : leg_summary['Type'],
                           'full_text' : text}
        else :
            bill.extras = {'local_classification' : leg_summary['Type']}

        yield bill
def scrape_bill(self, row, chamber, session):
    """Build and yield a Delaware Bill (plus its votes) from one API row.

    Skips substituted bills and amendments; when a substitute bill id
    (e.g. 'SS 1 for SB 5') is found, scrapes under the original id and
    records the substitute in bill.extras.

    :param row: dict from the Delaware legislation API
    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    """
    bill_id = row['LegislationDisplayCode']

    # hack for empty StatusName
    statusless_bills = ['HA 2 to SS 1 for SB 5',
                        'HA 3 to SS 1 for SB 5']
    is_force_substitute = bill_id in statusless_bills \
        and row['StatusName'] is None

    is_substituted = is_force_substitute \
        or 'Substituted' in row['StatusName']

    if is_substituted:
        # skip substituted bills, the replacement is picked up instead
        self.warning('skipping %s: %s', bill_id, row['StatusName'])
        return

    substitute = None

    if bill_id.count(' ') > 1:
        if 'w/' in bill_id or 'SA' in bill_id or 'HA' in bill_id:
            # TODO: re-evaluate if these should be separate bills
            self.warning('skipping amendment %s', bill_id)
            return
        elif ' for ' in bill_id:
            self.info(
                "Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(' for ')
        else:
            raise ValueError('unknown bill_id format: ' + bill_id)

    bill_type = self.classify_bill(bill_id)
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'],
                                          'primary')
    if substitute:
        bill.extras['substitute'] = substitute

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId'])
    bill.add_source(html_url, note='text/html')
    html = self.lxmlize(html_url)

    # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
    additional_sponsors = html.xpath(
        '//label[text()="Additional Sponsor(s):"]'
        '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace(
            'https://legis.delaware.gov/LegislatorDetail?'
            'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
    # bug fix: this previously queried the "Additional Sponsor(s):" label
    # again, so cosponsors were never scraped (and primaries were
    # double-added); now queries the Co-Sponsor(s) label per the comment
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace(
            'https://legis.delaware.gov/LegislatorDetail?'
            'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath(
        '//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url,
                              media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)

    yield bill
def scrape(self, session=None):
    """Scrape DC Council bills and votes for a session.

    Pages through the LIMS "advanced search" endpoint per_page results at
    a time, then fetches each bill's full record from GetPublicData and
    yields Bill and VoteEvent objects.
    """
    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # get member id matching for vote parsing
    member_ids = self.get_member_ids()[session]
    per_page = 10  # seems like it gives 10 no matter what.
    start_record = 0

    headers = {"Content-Type": "application/json"}
    url = ("http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/"
           "GetPublicAdvancedSearch")
    bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
    # DataTables-style request envelope plus LIMS search criteria;
    # iDisplayStart is advanced below to page through results
    params = {
        "request": {
            "sEcho": 2,
            "iColumns": 4,
            "sColumns": "",
            "iDisplayStart": 0,
            "iDisplayLength": per_page,
            "mDataProp_0": "ShortTitle",
            "mDataProp_1": "Title",
            "mDataProp_2": "LegislationCategories",
            "mDataProp_3": "Modified",
            "iSortCol_0": 0,
            "sSortDir_0": "asc",
            "iSortingCols": 0,
            "bSortable_0": "true",
            "bSortable_1": "true",
            "bSortable_2": "true",
            "bSortable_3": "true"
        },
        "criteria": {
            "Keyword": "",
            "Category": "",
            "SubCategoryId": "",
            "RequestOf": "",
            "CouncilPeriod": str(session),
            "Introducer": "",
            "CoSponsor": "",
            "CommitteeReferral": "",
            "CommitteeReferralComments": "",
            "StartDate": "",
            "EndDate": "",
            "QueryLimit": 100,
            "FilterType": "",
            "Phases": "",
            "LegislationStatus": "0",
            "IncludeDocumentSearch": "false"
        }
    }
    param_json = json.dumps(params)
    response = self.post(url, headers=headers, data=param_json)
    # the response is a terrible string-of-nested-json-strings. Yuck.
    response = decode_json(response.json()["d"])
    data = response["aaData"]

    # NOTE(review): module-level global; presumably shared with a helper
    # that deduplicates versions — confirm against the rest of the file
    global bill_versions

    while len(data) > 0:

        for bill in data:
            # sometimes they're in there more than once, so we'll keep track
            bill_versions = []

            bill_id = bill["Title"]
            if bill_id.startswith("AG"):
                # actually an agenda, skip
                continue
            bill_params = {"legislationId": bill_id}
            bill_info = self.post(bill_url, headers=headers,
                                  data=json.dumps(bill_params))
            bill_info = decode_json(bill_info.json()["d"])["data"]
            bill_source_url = "http://lims.dccouncil.us/Legislation/"+bill_id

            legislation_info = bill_info["Legislation"][0]
            title = legislation_info["ShortTitle"]

            if bill_id.startswith("R") or bill_id.startswith("CER"):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            # rebinds the loop variable 'bill' from the raw search row
            # to the Bill object for the rest of this iteration
            bill = Bill(bill_id,
                        legislative_session=session,
                        title=title,
                        classification=bill_type)

            # sponsors and cosponsors
            if "Introducer" in legislation_info:
                introducers = legislation_info["Introducer"]
                intro_date = self.date_format(legislation_info["IntroductionDate"])
                bill.add_action("Introduced",
                                intro_date,
                                classification="introduction")
            else:
                # sometimes there are introducers, sometimes not.
                # Set Introducers to empty array to avoid downstream breakage,
                # but log bills without introducers
                self.logger.warning("No Introducer: {0}".format(bill.identifier))
                introducers = []

            try:
                # sometimes there are cosponsors, sometimes not.
                cosponsors = legislation_info["CoSponsor"]
            except KeyError:
                cosponsors = []

            for i in introducers:
                name = i["Name"]
                # they messed up Phil Mendelson's name
                if name == "Phil Pmendelson":
                    name = "Phil Mendelson"
                bill.add_sponsorship(name,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)

            for s in cosponsors:
                name = s["Name"]
                if name == "Phil Pmendelson":
                    name = "Phil Mendelson"
                bill.add_sponsorship(name=name,
                                     classification="cosponsor",
                                     entity_type='person',
                                     primary=False)

            # if it's become law, add the law number as an alternate title
            if "LawNumber" in legislation_info:
                law_num = legislation_info["LawNumber"]
                if law_num:
                    bill.add_title(law_num)

            # also sometimes it's got an act number
            if "ActNumber" in legislation_info:
                act_num = legislation_info["ActNumber"]
                if act_num:
                    bill.add_title(act_num)

            # sometimes AdditionalInformation has a previous bill name
            if "AdditionalInformation" in legislation_info:
                add_info = legislation_info["AdditionalInformation"]
                if "previously" in add_info.lower():
                    prev_title = add_info.lower().replace("previously", ""
                                                          ).strip().replace(" ", "")
                    bill.add_title(prev_title.upper())
                elif add_info:
                    bill.extras["additional_information"] = add_info

            if "WithDrawnDate" in legislation_info:
                withdrawn_date = self.date_format(legislation_info["WithDrawnDate"])
                withdrawn_by = legislation_info["WithdrawnBy"][0]["Name"].strip()
                if withdrawn_by == "the Mayor":
                    bill.add_action("withdrawn",
                                    withdrawn_date,
                                    chamber="executive",
                                    classification="withdrawal")
                elif "committee" in withdrawn_by.lower():
                    a = bill.add_action("withdrawn",
                                        withdrawn_date,
                                        classification="withdrawal")
                    a.add_related_entity(withdrawn_by,
                                         entity_type='organization')
                else:
                    a = bill.add_action("withdrawn",
                                        withdrawn_date,
                                        classification="withdrawal")
                    a.add_related_entity(withdrawn_by,
                                         entity_type='person')

            # deal with actions involving the mayor
            mayor = bill_info["MayorReview"]
            if mayor != []:
                mayor = mayor[0]

                if "TransmittedDate" in mayor:
                    transmitted_date = self.date_format(mayor["TransmittedDate"])
                    bill.add_action("transmitted to mayor",
                                    transmitted_date,
                                    chamber="executive",
                                    classification="executive-receipt")

                if 'SignedDate' in mayor:
                    signed_date = self.date_format(mayor["SignedDate"])
                    bill.add_action("signed",
                                    signed_date,
                                    chamber="executive",
                                    classification="executive-signature")

                # if returned but not signed, it was vetoed
                elif 'ReturnedDate' in mayor:
                    veto_date = self.date_format(mayor["ReturnedDate"])
                    bill.add_action("vetoed",
                                    veto_date,
                                    chamber="executive",
                                    classification="executive-veto")

                    # if it was returned and enacted but not signed, there was a veto override
                    if 'EnactedDate' in mayor:
                        override_date = self.date_format(mayor["EnactedDate"])
                        bill.add_action("veto override",
                                        override_date,
                                        classification="veto-override-passage")

                if 'AttachmentPath' in mayor:
                    # documents relating to the mayor's review
                    self.add_documents(mayor["AttachmentPath"], bill)

            congress = bill_info["CongressReview"]
            if len(congress) > 0:
                congress = congress[0]
                if "TransmittedDate" in congress:
                    transmitted_date = self.date_format(congress["TransmittedDate"])
                    bill.add_action("Transmitted to Congress for review",
                                    transmitted_date)

            # deal with committee actions
            if "DateRead" in legislation_info:
                date = legislation_info["DateRead"]
            elif "IntroductionDate" in legislation_info:
                date = legislation_info["IntroductionDate"]
            else:
                self.logger.warning("we can't find anything that looks like an "
                                    "action date. Skipping")
                continue
            date = self.date_format(date)

            if "CommitteeReferral" in legislation_info:
                committees = []
                for committee in legislation_info["CommitteeReferral"]:
                    # "retained by the council" means no committee referral
                    if committee["Name"].lower() == "retained by the council":
                        committees = []
                        break
                    else:
                        committees.append(committee["Name"])
                if committees != []:
                    a = bill.add_action("referred to committee",
                                        date,
                                        classification="referral-committee")
                    for com in committees:
                        a.add_related_entity(com, entity_type='organization')

            if "CommitteeReferralComments" in legislation_info:
                a = bill.add_action("comments from committee", date)
                for committee in legislation_info["CommitteeReferralComments"]:
                    a.add_related_entity(committee["Name"],
                                         entity_type='organization')

            # deal with random docs floating around
            docs = bill_info["OtherDocuments"]
            for d in docs:
                if "AttachmentPath" in d:
                    self.add_documents(d["AttachmentPath"], bill)
                else:
                    self.logger.warning("Document path missing from 'Other Documents'")

            if "MemoLink" in legislation_info:
                self.add_documents(legislation_info["MemoLink"], bill)

            if "AttachmentPath" in legislation_info:
                self.add_documents(legislation_info["AttachmentPath"], bill)

            # full council votes
            votes = bill_info["VotingSummary"]
            for vote in votes:
                v = self.process_vote(vote, bill, member_ids)
                if v:
                    v.add_source(bill_source_url)
                    yield v

            # deal with committee votes
            if "CommitteeMarkup" in bill_info:
                committee_info = bill_info["CommitteeMarkup"]
                if len(committee_info) > 0:
                    for committee_action in committee_info:
                        v = self.process_committee_vote(committee_action, bill)
                        if v:
                            v.add_source(bill_source_url)
                            yield v
                    # NOTE(review): committee_info is a list of dicts, so this
                    # membership test compares the string against whole dicts
                    # and looks like it can never be true; 'vote' here is the
                    # leftover variable from the council-vote loop above —
                    # likely should check/use committee_action instead. Left
                    # as-is pending confirmation.
                    if "AttachmentPath" in committee_info:
                        self.add_documents(vote["AttachmentPath"], bill)

            bill.add_source(bill_source_url)
            yield bill

        # get next page
        start_record += per_page
        params["request"]["iDisplayStart"] = start_record
        param_json = json.dumps(params)
        response = self.post(url, headers=headers, data=param_json)
        response = decode_json(response.json()["d"])
        data = response["aaData"]
def _parse_senate_billpage(self, bill_url, year):
    """Parse one Missouri Senate bill detail page and yield a Bill.

    Reads the id/title/description labels off the page, attaches the
    primary sponsor, then hands off cosponsors, actions, and versions to
    their dedicated sub-parsers (each lives on its own page).
    """
    page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    def _label_text(elem_id):
        return page.xpath('//*[@id="%s"]' % elem_id)[0].text_content()

    bill_id = page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # the first three characters of the bill number determine its type,
    # falling back to a plain "bill" when the prefix is unrecognized
    bill_type = bill_types.get(bill_id[:3], "bill")

    # subjects are keyed by the bill id with spaces stripped
    bid = bill_id.replace(" ", "")
    subs = []
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=year,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)

    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    sponsor_anchor = page.xpath('//a[@id="hlSponsor"]')[0]
    # bill_sponsor_link = sponsor_anchor.attrib.get('href')
    bill.add_sponsorship(
        sponsor_anchor.text_content(),
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_links = page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_links and cosponsor_links[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_links[0].attrib['href'])

    # get the actions
    action_links = page.xpath('//a[@id="hlAllActions"]')
    if action_links:
        self._parse_senate_actions(bill, action_links[0].attrib['href'])

    # stored on a separate page
    version_links = page.xpath('//a[@id="hlFullBillText"]')
    if version_links and version_links[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, version_links[0].attrib['href'])

    yield bill
def scrape(self, session=None, chambers=None):
    """Scrape Ohio bills for a session.

    Sessions before the 128th have no data; 128-130 use the legacy HTML
    scraper (old_scrape); 131+ use the undocumented solarapi JSON API.
    Yields Bill and VoteEvent objects.
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))

    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)

    else:
        # map the API's chamber strings to OCD chamber names
        chamber_dict = {"Senate": "upper", "House": "lower",
                        "House of Representatives": "lower",
                        "house": "lower", "senate": "upper"}

        # so presumanbly not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {"approved": True,
                        "passed": True,
                        "adopted": True,
                        "true": True,
                        "false": False,
                        "failed": False,
                        True: True,
                        False: False}

        # API action codes -> OCD action classifications (None = unclassified)
        action_dict = {"ref_ctte_100": "referral-committee",
                       "intro_100": "introduction",
                       "intro_101": "introduction",
                       "pass_300": "passage",
                       "intro_110": "reading-1",
                       "refer_210": "referral-committee",
                       "crpt_301": None,
                       "crpt_317": None,
                       "concur_606": "passage",
                       "pass_301": "passage",
                       "refer_220": "referral-committee",
                       "intro_102": ["introduction", "passage"],
                       "intro_105": ["introduction", "passage"],
                       "intro_ref_ctte_100": "referral-committee",
                       "refer_209": None,
                       "intro_108": ["introduction", "passage"],
                       "intro_103": ["introduction", "passage"],
                       "msg_reso_503": "passage",
                       "intro_107": ["introduction", "passage"],
                       "imm_consid_360": "passage",
                       "refer_213": None,
                       "adopt_reso_100": "passage",
                       "adopt_reso_110": "passage",
                       "msg_507": "amendment-passage",
                       "confer_713": None,
                       "concur_603": None,
                       "confer_712": None,
                       "msg_506": "amendment-failure",
                       "receive_message_100": "passage",
                       "motion_920": None,
                       "concur_611": None,
                       "confer_735": None,
                       "third_429": None,
                       "final_501": None,
                       "concur_608": None,
                       }

        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
        legislators = self.get_legislator_ids(first_page)
        # bulk-fetch ancillary documents once, attached per-bill below
        all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

        for row in self.get_bill_rows(session):
            spacer, number_link, _ga, title, primary_sponsor, status, spacer = row.xpath('td')

            # S.R.No.1 -> SR1
            bill_id = number_link.text_content().replace('No.', '')
            bill_id = bill_id.replace('.', '').replace(' ', '')
            # put one space back in between type and number
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            title = title.text_content().strip()
            title = re.sub(r'^Title', '', title)

            chamber = 'lower' if 'H' in bill_id else 'upper'
            classification = 'bill' if 'B' in bill_id else 'resolution'

            bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                        title=title, classification=classification)
            bill.add_source(number_link.xpath('a/@href')[0])

            # get bill from API
            bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                            'general_assembly_{}/{}/{}/'.format(
                                session,
                                'bills' if 'B' in bill_id else 'resolutions',
                                bill_id.lower().replace(' ', '')
                            ))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data['items'][0]['longtitle']
            bill.add_title(data['items'][0]['longtitle'], 'long title')

            # this stuff is version-specific
            for version in data['items']:
                version_name = version["version"]
                version_link = base_url+version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type='application/pdf')

            # we'll use latest bill_version for everything else
            bill_version = data['items'][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='primary',
                    entity_type='person',
                    primary=True
                )

            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='cosponsor',
                    entity_type='person',
                    primary=False,
                )

            try:
                action_doc = self.get(base_url+bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                # no action document for this bill; nothing to record
                pass
            else:
                actions = action_doc.json()
                # API returns newest-first; reverse to chronological order
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning("Unknown action {desc} with code {code}."
                                     " Add it to the action_dict"
                                     ".".format(desc=action_desc,
                                                code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(datetime.datetime.strptime(
                        action["datetime"],
                        "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc,
                                    date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

            # votes
            vote_url = base_url+bill_version["votes"][0]["link"]
            vote_doc = self.get(vote_url)
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not "
                             "loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(data["items"][0]["effective_date"],
                                                            "%Y-%m-%d")
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action list JSON.
                # It looks like:
                # Effective 7/6/18
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(effective_action,
                                effective_date,
                                chamber="executive",
                                classification=["became-law"])

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url+bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError("Whoa, a disapprove! We've never"
                                         " gotten one before."
                                         " Go write some code to deal "
                                         "with it: {}".format(disapprove_url))

            yield bill
def get_bill(self, matter):
    '''Make Bill object from given matter.

    Returns None (skipping the matter entirely) when the matter is a
    known duplicate, is missing core fields, or when any of its
    API sub-resources fail to resolve.
    '''
    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    # all three core fields are required to build a valid Bill
    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]

    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None

        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            # bug fix: this called self.session(...), but the helper used
            # for the bill's own session above is self.sessions(...);
            # the singular name would raise AttributeError on any matter
            # with related bills
            related_bill_session = self.sessions(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        if text['MatterTextPlain']:
            # strip NUL characters the API sometimes embeds
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                u'\u0000', '')

        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                u'\u0000', '')

    return bill
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.
    """
    url = BILL_URL % (session, bill_id.replace(" ", ""))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute(
        "http://legislature.idaho.gov/legislation/%s/" % session)
    # table layout (by position): [0] sponsors, [1] title, [2] actions
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(
        legislative_session=session,
        chamber=chamber,
        identifier=bill_id,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    for subject in self._subjects[bill_id.replace(" ", "")]:
        bill.add_subject(subject)

    if short_title and title.lower() != short_title.lower():
        bill.add_title(short_title, "short title")

    # documents
    doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get("href")
        # engrossments/bill text/amendments are versions; all else documents
        if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
            bill.add_version_link(note=name, url=href,
                                  media_type="application/pdf")
        else:
            bill.add_document_link(note=name, url=href,
                                   media_type="application/pdf")

    def _split(string):
        # split a sponsor list on commas / AND separators
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split("by")
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if "COMMITTEE" in sponsors.upper():
                bill.add_sponsorship(
                    name=sponsors.strip(),
                    entity_type="organization",
                    primary=True,
                    classification="primary",
                )
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(
                            classification="primary",
                            name=person,
                            entity_type="person",
                            primary=True,
                        )

    actor = chamber
    last_date = None
    # if a bill has passed a chamber or been 'received from'
    # then the next committee passage is in the opposite chamber
    has_moved_chambers = False
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]

        # rows with a blank date inherit the date of the previous row
        if date:
            last_date = date
        else:
            date = last_date

        # action rows carry only month/day; the year comes from the session
        date = datetime.datetime.strptime(date + "/" + session[0:4],
                                          "%m/%d/%Y").strftime("%Y-%m-%d")
        if action.startswith("House"):
            actor = "lower"
        elif action.startswith("Senate"):
            actor = "upper"

        # votes
        if "AYES" in action or "NAYS" in action:
            yield from self.parse_vote(actor, date, row[2],
                                       session, bill_id, chamber, url)
            # bill.add_vote_event(vote)
        # some td's text is seperated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace(u"\xa0", " ").strip()
        atype = get_action(actor, action)
        if atype and "passage" in atype:
            has_moved_chambers = True
        if atype and "committee-passage" in atype and has_moved_chambers:
            actor = _OTHER_CHAMBERS[actor]
        bill.add_action(action, date, chamber=actor, classification=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if "to House" in action:
            actor = "lower"
        elif "to Senate" in action:
            actor = "upper"
    yield bill
def scrape_bill(self, bill_num, session):
    """Scrape one Wyoming bill (plus its roll-call votes) from the LSO
    BillInformation JSON API and yield it.

    :param bill_num: bill number as used by the API (e.g. 'HB0001')
    :param session: legislative session identifier
    """
    chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}

    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
        '{}?calendarDate='.format(
            session, bill_num)
    response = self.get(bill_json_url)
    bill_json = json.loads(response.content.decode('utf-8'))

    # bug fix: bill identifiers start with 'H' or 'S'; the old truthiness
    # test (`if bill_json['bill'][0]`) was always true, so every bill was
    # tagged 'lower' — compare against 'H' explicitly
    chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

    bill = Bill(
        identifier=bill_json['bill'],
        legislative_session=session,
        title=bill_json['catchTitle'],
        chamber=chamber,
        classification="bill",
    )

    bill.add_title(bill_json['billTitle'])

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, bill_json['bill'])
    bill.add_source(source_url)

    for action_json in bill_json['billActions']:
        utc_action_date = self.parse_local_date(action_json['statusDate'])

        actor = None
        if action_json['location'] and action_json[
                'location'] in chamber_map:
            actor = chamber_map[action_json['location']]

        action = bill.add_action(
            chamber=actor,
            description=action_json['statusMessage'],
            date=utc_action_date,
            classification=categorize_action(action_json['statusMessage']),
        )

        action.extras = {
            'billInformationID': action_json['billInformationID']
        }

    if bill_json['introduced']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

        bill.add_version_link(
            note="Introduced",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['enrolledAct']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

        bill.add_version_link(
            note="Enrolled",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['fiscalNote']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

        bill.add_document_link(
            note="Fiscal Note",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['digest']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

        bill.add_document_link(
            note="Bill Digest",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['vetoes']:
        for veto in bill_json['vetoes']:
            url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
            bill.add_version_link(
                note=veto['vetoLinkText'],
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

    for amendment in bill_json['amendments']:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
            session, amendment['amendmentNumber'])

        if amendment['sponsor'] and amendment['status']:
            title = 'Amendment {} ({}) - {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
                amendment['sponsor'],
                amendment['status'],
            )
        else:
            title = 'Amendment {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
            )
        # add versions of the bill text
        version = bill.add_version_link(
            note=title,
            url=url,
            media_type="application/pdf",
        )
        version['extras'] = {
            'amendmentNumber': amendment['amendmentNumber'],
            'sponsor': amendment['sponsor'],
        }

    for sponsor in bill_json['sponsors']:
        status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
        sponsor_type = 'person' if sponsor[
            'sponsorTitle'] else 'organization'
        bill.add_sponsorship(
            name=sponsor['name'],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor['primarySponsor']
        )

    if bill_json['summary']:
        bill.add_abstract(
            note="summary",
            abstract=bill_json['summary'],
        )

    if bill_json['enrolledNumber']:
        bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

    if bill_json['chapter']:
        bill.extras['chapter'] = bill_json['chapter']

    if bill_json['effectiveDate']:
        eff = datetime.datetime.strptime(bill_json['effectiveDate'],
                                         '%m/%d/%Y')
        bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

    bill.extras['wy_bill_id'] = bill_json['id']

    for vote_json in bill_json['rollCalls']:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def scrape_chamber(self, chamber, session):
    """Scrape all Kansas bills for one chamber from the bill_status API.

    Yields a Bill for every entry whose number starts with the chamber's
    letter ('H' or 'S'); versions are filled in from the HTML pages.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json['content']

    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # Classify from the bill number. BUG FIX: btype is now reset to a
        # 'bill' default on every iteration -- previously an id matching
        # none of the branches reused the previous iteration's value (or
        # raised NameError if it was the first bill).
        btype = 'bill'
        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}
        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        # Keep the long title as an alternate when it differs.
        if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        # An "original sponsor" is the API's expression of "primary sponsor"
        for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
            bill.add_sponsorship(
                name=primary_sponsor,
                entity_type='organization' if "committee" in primary_sponsor.lower()
                else 'person',
                primary=True,
                classification="original sponsor"
            )
        for sponsor in bill_data['SPONSOR_NAMES']:
            if sponsor in bill_data['ORIGINAL_SPONSOR']:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type='organization' if "committee" in sponsor.lower() else 'person',
                primary=False,
                classification='cosponsor',
            )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = ('upper' if event['chamber'] == 'Senate' else 'lower')
            date = event['session_date']
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']
            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'], event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(action, date, chamber=actor, classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)
        yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all California bills of one measure type for a session.

    Reads bills from the CA SQL mirror (`CABill`), builds Bill objects
    with versions, sponsors, actions, and votes, and yields VoteEvent
    objects followed by each Bill.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        # Sanity check: Senate ids in the lower chamber (or vice versa)
        # indicate a data problem; skip them.
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]
        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()
        # Robustness fix: summary was previously only bound inside the
        # `if bill.versions:` branch; initialize it so the `if summary:`
        # check below can never raise NameError.
        summary = ''

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # NOTE: `version` here is the loop variable leaked from the
        # versions loop above, i.e. the last version with bill_xml.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUG FIX: in Python 3 `filter()` returns a one-shot
                # iterator; the old code exhausted it inside the assert's
                # `list(...)`, so the zip() below and the stored
                # kwargs['committees'] always came up empty. Materialize
                # it as a list instead.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()
def _parse_senate_billpage(self, bill_url, year):
    """Scrape one Missouri Senate bill detail page and yield the Bill.

    Reads the id/title/description from labeled spans on the page, then
    delegates cosponsors, actions, and versions to their own parsers.
    """
    page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # Classify from the three-character prefix; default is plain "bill".
    prefix = bill_id[:3]
    bill_type = bill_types[prefix] if prefix in bill_types else "bill"

    # Subjects were collected up front, keyed on the whitespace-free id.
    squashed_id = bill_id.replace(" ", "")
    subjects = []
    if squashed_id in self._subjects:
        subjects = self._subjects[squashed_id]
        self.info("With subjects for this bill")
        self.info(squashed_id)

    if squashed_id == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subjects
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # The primary sponsor is a single link on the page.
    sponsor_el = page.xpath('//a[@id="hlSponsor"]')[0]
    # bill_sponsor_link = sponsor_el.attrib.get('href')
    bill.add_sponsorship(
        sponsor_el.text_content(),
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_tag and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # Actions live on a separate page as well.
    action_links = page.xpath('//a[@id="hlAllActions"]')
    if action_links:
        self._parse_senate_actions(bill, action_links[0].attrib['href'])

    # Full bill text versions are stored on a separate page.
    version_links = page.xpath('//a[@id="hlFullBillText"]')
    if version_links and version_links[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, version_links[0].attrib['href'])

    # Attach adopted amendments (linked inline) as versions.
    for link in page.xpath('//a[contains(@href,"ShowAmendment.asp")]'):
        link_text = link.xpath('string(.)').strip()
        if 'adopted' in link_text.lower():
            bill.add_version_link(link_text, link.xpath('@href')[0],
                                  media_type='application/pdf',
                                  on_duplicate='ignore')

    yield bill
def scrape(self, session=None, chambers=None):
    """Entry point: scrape Ohio bills, actions, and votes for a session.

    Sessions before the 128th have no data; 128-130 use the legacy HTML
    scraper (`old_scrape`); 131 and later use the undocumented solarapi
    JSON API. Yields Bill and VoteEvent objects.
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))

    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)

    else:
        # Map the API's chamber spellings (several capitalizations
        # observed) onto openstates chamber codes.
        chamber_dict = {"Senate": "upper", "House": "lower",
                        "House of Representatives": "lower",
                        "house": "lower", "senate": "upper"}
        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {"approved": True,
                        "passed": True,
                        "adopted": True,
                        "true": True,
                        "false": False,
                        "failed": False,
                        True: True,
                        False: False}
        # Map API action codes onto openstates action classifications.
        # None means "known code, intentionally unclassified"; a list
        # means one event maps to several classifications.
        action_dict = {"ref_ctte_100": "referral-committee",
                       "intro_100": "introduction",
                       "pass_300": "passage",
                       "intro_110": "reading-1",
                       "refer_210": "referral-committee",
                       "crpt_301": None,
                       "crpt_317": None,
                       "concur_606": "passage",
                       "pass_301": "passage",
                       "refer_220": "referral-committee",
                       "intro_102": ["introduction", "passage"],
                       "intro_105": ["introduction", "passage"],
                       "intro_ref_ctte_100": "referral-committee",
                       "refer_209": None,
                       "intro_108": ["introduction", "passage"],
                       "intro_103": ["introduction", "passage"],
                       "msg_reso_503": "passage",
                       "intro_107": ["introduction", "passage"],
                       "imm_consid_360": "passage",
                       "refer_213": None,
                       "adopt_reso_100": "passage",
                       "msg_507": "amendment-passage",
                       "confer_713": None,
                       "concur_603": None,
                       "confer_712": None,
                       "msg_506": "amendment-failure",
                       "receive_message_100": "passage",
                       "motion_920": None,
                       "concur_611": None,
                       "confer_735": None
                       }
        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
        legislators = self.get_legislator_ids(first_page)
        # Bulk document listings fetched once and matched to bills later.
        # NOTE(review): "synopsiss"/"analysiss" look misspelled but appear
        # to be the literal endpoint names -- do not "fix" them.
        all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

        for row in self.get_bill_rows(session):
            # Only the number cell and title are used here; ga, sponsor,
            # and status columns are unpacked but unused.
            number_link, ga, title, primary_sponsor, status = row.xpath('td')

            bill_id = number_link.text_content()
            title = title.text_content().strip()
            chamber = 'lower' if 'H' in bill_id else 'upper'
            classification = 'bill' if 'B' in bill_id else 'resolution'

            bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                        title=title, classification=classification)
            bill.add_source(number_link.xpath('a/@href')[0])

            # get bill from API
            bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                            'general_assembly_{}/{}/{}/'.format(
                                session,
                                'bills' if 'B' in bill_id else 'resolutions',
                                bill_id.lower().replace(' ', '')
                            ))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data['items'][0]['longtitle']
            bill.add_title(data['items'][0]['longtitle'], 'long title')

            # this stuff is version-specific
            for version in data['items']:
                version_name = version["version"]
                version_link = base_url+version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type='application/pdf')

            # we'll use latest bill_version for everything else
            bill_version = data['items'][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='primary',
                    entity_type='person',
                    primary=True
                )

            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='cosponsor',
                    entity_type='person',
                    primary=False,
                )

            # Actions come from a per-bill endpoint; a 404 here simply
            # means the bill has none yet.
            try:
                action_doc = self.get(base_url+bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning("Unknown action {desc} with code {code}."
                                     " Add it to the action_dict"
                                     ".".format(desc=action_desc,
                                                code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(datetime.datetime.strptime(
                        action["datetime"], "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc, date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

            # votes
            vote_url = base_url+bill_version["votes"][0]["link"]
            vote_doc = self.get(vote_url)
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url, base_url, bill,
                                         legislators, chamber_dict, vote_results)

            # Committee votes are a second endpoint; skip the bill's
            # remaining processing if it won't load.
            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not "
                             "loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url, base_url, bill,
                                         legislators, chamber_dict, vote_results)

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url+bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError("Whoa, a disapprove! We've never"
                                         " gotten one before."
                                         " Go write some code to deal "
                                         "with it: {}".format(disapprove_url))

            yield bill
def _parse_house_bill(self, url, session):
    """Scrape one Missouri House bill summary page and yield the Bill.

    Extracts the id, titles, sponsor, actions, versions, summaries, and
    amendments. Table-row positions shift depending on whether a
    cosponsor row and/or a "Governor Action" row is present, hence the
    offset bookkeeping below.
    """
    # using the print page makes the page simpler, and also *drastically* smaller
    # (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self._house_base_url, url)

    # the URL is an iframed version now, so swap in for the actual bill page
    url = url.replace('Bill.aspx', 'BillContent.aspx')
    url = url.replace('&code=R', '&code=R&style=new')

    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

    bill_page = self.get(url).text
    bill_page = lxml.html.fromstring(bill_page)
    bill_page.make_links_absolute(url)

    bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
    if len(bill_id) == 0:
        # A blank summary page; remember the URL so it can be reported.
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = bill_id[0].text_content()
    bill_id = clean_text(bill_id)

    bill_desc = bill_page.xpath(
        '//*[@class="BillDescription"]')[0].text_content()
    bill_desc = clean_text(bill_desc)

    table_rows = bill_page.xpath('//table/tr')
    # if there is a cosponsor all the rows are pushed down one for the extra row
    # for the cosponsor:
    cosponsorOffset = 0
    if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
        cosponsorOffset = 1

    lr_label_tag = table_rows[3 + cosponsorOffset]
    # The assert guards against layout drift: if the rows move again,
    # fail loudly rather than scrape the wrong cell.
    assert lr_label_tag[0].text_content().strip() == 'LR Number:'
    # bill_lr = lr_label_tag[1].text_content()

    lastActionOffset = 0
    if table_rows[4 + cosponsorOffset][0].text_content().strip(
    ) == 'Governor Action:':
        lastActionOffset = 1

    official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
    assert official_title_tag[0].text_content().strip() == 'Bill String:'
    official_title = official_title_tag[1].text_content()

    # could substitute the description for the name,
    # but keeping it separate for now.

    # Classify from the three-character prefix (e.g. 'HB ', 'HJR').
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")

    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = 'Appropriations Bill'
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            return

    bill = Bill(
        bill_id,
        chamber='lower',
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note='official')
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    #     return
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # check for cosponsors
    # NOTE(review): the trailing-comma unpack requires exactly one
    # matching link; a missing link raises ValueError here -- confirm
    # every bill page has it.
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # get bill documents (first BillDocuments div)
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (self._house_base_url,
                             doc_tag[0].attrib['href'])
        bill.add_document_link(doc, text_url, media_type='text/html')

    # get bill versions (second BillDocuments div)
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            if vurl.text == 'PDF':
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, vurl.attrib['href'],
                                  media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill versions
    # everything between the row containing "Bill Text" and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill summaries
    # everything between the row containing "Bill Summary" and the next div.DocHeaderRow
    summary_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Summary")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )

    # if there are no amendments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow")]')

    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = 'Bill Summary ({})'.format(version)
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_document_link(summary_name, path, media_type=mimetype,
                                   on_duplicate='ignore')

    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[contains(text(),"Amendment")]/'
        'following-sibling::div[contains(@class,"DocRow")]')

    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip(
        )
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip(
        )
        summary_name = 'Amendment {}'.format(version)

        # Status icons on the row determine the label suffix.
        defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
        if defeated_icon:
            summary_name = '{} (Defeated)'.format(summary_name)

        adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
        if adopted_icon:
            summary_name = '{} (Adopted)'.format(summary_name)

        distributed_icon = row.xpath(
            './/img[contains(@title,"Distributed")]')
        if distributed_icon:
            summary_name = '{} (Distributed)'.format(summary_name)

        if '.pdf' in path:
            mimetype = 'application/pdf'
        else:
            mimetype = 'text/html'

        bill.add_version_link(summary_name, path, media_type=mimetype,
                              on_duplicate='ignore')

    yield bill
def scrape_chamber(self, chamber, session):
    """Scrape all Kansas bills for one chamber from the bill_status API.

    Yields a Bill for every entry whose number starts with the chamber's
    letter ('H' or 'S'); versions are filled in from the HTML pages.
    """
    chamber_name = "Senate" if chamber == "upper" else "House"
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + "bill_status/").text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json["content"]

    for bill_data in bills:
        bill_id = bill_data["BILLNO"]

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # Classify from the bill number. BUG FIX: btype is now reset to a
        # "bill" default on every iteration -- previously an id matching
        # none of the branches reused the previous iteration's value (or
        # raised NameError if it was the first bill).
        btype = "bill"
        if "CR" in bill_id:
            btype = "concurrent resolution"
        elif "R" in bill_id:
            btype = "resolution"
        elif "B" in bill_id:
            btype = "bill"

        title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

        # main
        bill = Bill(bill_id, session, title, chamber=chamber,
                    classification=btype)
        bill.extras = {"status": bill_data["STATUS"]}
        bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

        # Keep the long title as an alternate when it differs.
        if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
            bill.add_title(bill_data["LONGTITLE"])

        # An "original sponsor" is the API's expression of "primary sponsor"
        for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
            bill.add_sponsorship(
                name=primary_sponsor,
                entity_type="organization"
                if "committee" in primary_sponsor.lower() else "person",
                primary=True,
                classification="original sponsor",
            )
        for sponsor in bill_data["SPONSOR_NAMES"]:
            if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type="organization"
                if "committee" in sponsor.lower() else "person",
                primary=False,
                classification="cosponsor",
            )

        # history is backwards
        for event in reversed(bill_data["HISTORY"]):
            actor = "upper" if event["chamber"] == "Senate" else "lower"
            date = event["session_date"]
            # append committee names if present
            if "committee_names" in event:
                action = (event["status"] + " " +
                          " and ".join(event["committee_names"]))
            else:
                action = event["status"]
            if event["action_code"] not in ksapi.action_codes:
                self.warning(
                    "unknown action code on %s: %s %s"
                    % (bill_id, event["action_code"], event["status"]))
                atype = None
            else:
                atype = ksapi.action_codes[event["action_code"]]
            bill.add_action(action, date, chamber=actor, classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)
        yield bill
def scrape(self, session=None): if not session: session = self.latest_session() self.info("no session specified, using %s", session) # get member id matching for vote parsing member_ids = self.get_member_ids()[session] per_page = 10 # seems like it gives 10 no matter what. start_record = 0 params = { "request": { "sEcho": 2, "iColumns": 4, "sColumns": "", "iDisplayStart": 0, "iDisplayLength": per_page, "mDataProp_0": "ShortTitle", "mDataProp_1": "Title", "mDataProp_2": "LegislationCategories", "mDataProp_3": "Modified", "iSortCol_0": 0, "sSortDir_0": "asc", "iSortingCols": 0, "bSortable_0": "true", "bSortable_1": "true", "bSortable_2": "true", "bSortable_3": "true", }, "criteria": { "Keyword": "", "Category": "", "SubCategoryId": "", "RequestOf": "", "CouncilPeriod": str(session), "Introducer": "", "CoSponsor": "", "CommitteeReferral": "", "CommitteeReferralComments": "", "StartDate": "", "EndDate": "", "QueryLimit": 100, "FilterType": "", "Phases": "", "LegislationStatus": "0", "IncludeDocumentSearch": "false", }, } param_json = json.dumps(params) response = api_request("/GetPublicAdvancedSearch", data=param_json) # the response is a terrible string-of-nested-json-strings. Yuck. 
response = response["d"] data = response["aaData"] while len(data) > 0: for bill in data: # sometimes they're in there more than once, so we'll keep track bill_id = bill["Title"] if bill_id.startswith("AG"): # actually an agenda, skip continue bill_params = {"legislationId": bill_id} bill_info = api_request("/GetPublicData", data=json.dumps(bill_params)) bill_info = bill_info["d"]["data"] bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id legislation_info = bill_info["Legislation"][0] title = legislation_info["ShortTitle"] if bill_id.startswith("R") or bill_id.startswith("CER"): bill_type = "resolution" else: bill_type = "bill" bill = Bill( bill_id, legislative_session=session, title=title, classification=bill_type, ) # sponsors and cosponsors if "Introducer" in legislation_info: introducers = legislation_info["Introducer"] else: # sometimes there are introducers, sometimes not. # Set Introducers to empty array to avoid downstream breakage, # but log bills without introducers self.logger.warning("No Introducer: {0}".format( bill.identifier)) introducers = [] try: # sometimes there are cosponsors, sometimes not. 
cosponsors = legislation_info["CoSponsor"] except KeyError: cosponsors = [] for i in introducers: name = i["Name"] # they messed up Phil Mendelson's name if name == "Phil Pmendelson": name = "Phil Mendelson" bill.add_sponsorship( name, classification="primary", entity_type="person", primary=True, ) for s in cosponsors: name = s["Name"] if name == "Phil Pmendelson": name = "Phil Mendelson" bill.add_sponsorship( name=name, classification="cosponsor", entity_type="person", primary=False, ) # if it's become law, add the law number as an alternate title if "LawNumber" in legislation_info: law_num = legislation_info["LawNumber"] if law_num: bill.add_title(law_num) # also sometimes it's got an act number if "ActNumber" in legislation_info: act_num = legislation_info["ActNumber"] if act_num: bill.add_title(act_num) # sometimes AdditionalInformation has a previous bill name if "AdditionalInformation" in legislation_info: add_info = legislation_info["AdditionalInformation"] if "previously" in add_info.lower(): prev_title = (add_info.lower().replace( "previously", "").strip().replace(" ", "")) bill.add_title(prev_title.upper()) elif add_info: bill.extras["additional_information"] = add_info if "WithDrawnDate" in legislation_info: withdrawn_date = self.date_format( legislation_info["WithDrawnDate"]) withdrawn_by = legislation_info["WithdrawnBy"][0][ "Name"].strip() if withdrawn_by == "the Mayor": bill.add_action( "withdrawn", withdrawn_date, chamber="executive", classification="withdrawal", ) elif "committee" in withdrawn_by.lower(): a = bill.add_action("withdrawn", withdrawn_date, classification="withdrawal") a.add_related_entity(withdrawn_by, entity_type="organization") else: a = bill.add_action("withdrawn", withdrawn_date, classification="withdrawal") a.add_related_entity(withdrawn_by, entity_type="person") for action in bill_info["LegislationBillHistory"]: action_name = action["Description"] action_date = datetime.datetime.strptime( action["ActionDate"], "%Y/%m/%d 
%H:%M:%S") action_date = self._TZ.localize(action_date) action_class = self.classify_action(action_name) if "mayor" in action_name.lower(): actor = "executive" else: actor = "legislature" a = bill.add_action( action_name, action_date, classification=action_class, chamber=actor, ) if (action_class is not None and "referral-committee" in action_class): if "CommitteeReferral" in legislation_info: committees = [] for committee in legislation_info[ "CommitteeReferral"]: if (committee["Name"].lower() == "retained by the council"): committees = [] break else: committees.append(committee["Name"]) if committees != []: for com in committees: a.add_related_entity( com, entity_type="organization") if "CommitteeReferralComments" in legislation_info: for committee in legislation_info[ "CommitteeReferralComments"]: a.add_related_entity( committee["Name"], entity_type="organization") # deal with actions involving the mayor mayor = bill_info["MayorReview"] if mayor != []: mayor = mayor[0] if "TransmittedDate" in mayor: transmitted_date = self.date_format( mayor["TransmittedDate"]) # if returned but not signed, it was vetoed elif "ReturnedDate" in mayor: veto_date = self.date_format(mayor["ReturnedDate"]) bill.add_action( "vetoed", veto_date, chamber="executive", classification="executive-veto", ) # if it was returned and enacted but not signed, there was a veto override if "EnactedDate" in mayor: override_date = self.date_format( mayor["EnactedDate"]) bill.add_action( "veto override", override_date, classification="veto-override-passage", ) if "AttachmentPath" in mayor: # documents relating to the mayor's review self.add_documents(mayor["AttachmentPath"], bill) congress = bill_info["CongressReview"] if len(congress) > 0: congress = congress[0] if "TransmittedDate" in congress: transmitted_date = self.date_format( congress["TransmittedDate"]) bill.add_action("Transmitted to Congress for review", transmitted_date) # deal with committee actions if "DateRead" in legislation_info: date 
= legislation_info["DateRead"] elif "IntroductionDate" in legislation_info: date = legislation_info["IntroductionDate"] else: self.logger.warning( "we can't find anything that looks like an " "action date. Skipping") continue date = self.date_format(date) # deal with random docs floating around docs = bill_info["OtherDocuments"] for d in docs: if "AttachmentPath" in d: self.add_documents(d["AttachmentPath"], bill) else: self.logger.warning( "Document path missing from 'Other Documents'") if "MemoLink" in legislation_info: self.add_documents(legislation_info["MemoLink"], bill) if "AttachmentPath" in legislation_info: self.add_documents(legislation_info["AttachmentPath"], bill) # full council votes votes = bill_info["VotingSummary"] for vote in votes: v = self.process_vote(vote, bill, member_ids) if v: v.add_source(bill_source_url) yield v # deal with committee votes if "CommitteeMarkup" in bill_info: committee_info = bill_info["CommitteeMarkup"] if len(committee_info) > 0: for committee_action in committee_info: v = self.process_committee_vote( committee_action, bill) if v: v.add_source(bill_source_url) yield v if "AttachmentPath" in committee_info: self.add_documents(vote["AttachmentPath"], bill) bill.add_source(bill_source_url) yield bill # get next page start_record += per_page params["request"]["iDisplayStart"] = start_record param_json = json.dumps(params) response = api_request("/GetPublicAdvancedSearch", data=param_json) response = response["d"] data = response["aaData"]
def scrape(self):
    """Yield Bill and VoteEvent objects scraped from the NYC Legistar web UI.

    Iterates every piece of legislation created after Jan 1 2014 (via the
    project helper ``self.legislation``), builds a pupa ``Bill`` for each,
    attaches titles/abstracts/sponsors/attachments/actions, and yields any
    roll-call votes found on action-detail pages before yielding the bill.

    NOTE(review): the Legistar column headers contain non-breaking spaces,
    hence keys like 'File\xa0#' and 'Action\xa0By'.
    """
    for leg_summary in self.legislation(
            created_after=datetime.datetime(2014, 1, 1)):
        leg_type = BILL_TYPES[leg_summary['Type']]

        # Session is unknown until we see the action history; it is filled
        # in below from the earliest action date.
        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,
                    classification=leg_type,
                    from_organization={"name": "New York City Council"})
        bill.add_source(leg_summary['url'])

        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])

        # The summary 'Title' above is the official title; 'Name' is the
        # staff-written one, kept as an alternate title.
        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number']:
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        # self._sponsors yields (name, classification, is_primary) triples.
        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary,
                                 entity_id=_make_pseudo_id(name=sponsor))

        for attachment in leg_details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # ``history`` may be a generator; materialize it because it is
        # consumed twice (min() below, then the action loop).
        history = list(history)

        if history:
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            # No actions at all: fall back to the oldest known session.
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history:
            action_description = action['Action']
            if not action_description:
                continue
            # NOTE(review): plain indexing — an unmapped action string
            # raises KeyError and kills the scrape; presumably intentional
            # so new action types get noticed.
            action_class = ACTION_CLASSIFICATION[action_description]
            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']
            # Normalize Legistar's organization names to ours.
            if responsible_org == 'City Council':
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration':
                responsible_org = 'Mayor'

            if responsible_org == 'Town Hall Meeting':
                # Not a real actor; skip the action entirely.
                continue
            else:
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                if action_class == 'committee-referral':
                    # Pull the committee name out of text like
                    # "... referred to the Committee on X".
                    action_details = self.actionDetails(action_detail_url)
                    referred_committee = action_details[
                        'Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(
                        referred_committee,
                        'organization',
                        entity_id=_make_pseudo_id(name=referred_committee))

                result, votes = self.extractVotes(action_detail_url)
                if votes:
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=action_class,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url)
                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        text = self.text(leg_summary['url'])

        if text:
            bill.extras = {'local_classification': leg_summary['Type'],
                           'full_text': text}
        else:
            bill.extras = {'local_classification': leg_summary['Type']}

        yield bill
def get_bill(self, matter):
    """Build and return a pupa ``Bill`` from a Legistar API matter dict.

    Returns ``None`` (caller must skip) when the matter belongs to a
    committee with no conventional type, is a known duplicate, is missing
    core fields, or when the API raises mid-build.

    Currently, NYC Legistar does not have conventional "Types" for
    three newly added committees:
    https://legistar.council.nyc.gov/Departments.aspx
    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.
    """
    orgs_without_type = ['Charter Revision Commission 2019',
                         'New York City Advisory Commission on Property Tax Reform',
                         'Democratic Conference of the Council of the City of New York']
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    intro_date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    # A bill without a date, title, and file number is unusable.
    if not all((intro_date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]

    bill_session = self.sessions(self.toTime(intro_date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        # Sponsorship data referenced a version we don't have; record the
        # bill URL for later investigation and drop the bill.
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        # Known duplicate attachment; the whole matter is a duplicate.
        if attachment['MatterAttachmentId'] == 103315:
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint('/matters/{0}',
                                         relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            # Related matter is gone from the API; treat the bill as broken.
            return None
        else:
            related_intro_date = related_bill['MatterIntroDate']
            # FIX: was ``self.session(...)`` — every other call in this
            # method (see bill_session above) uses ``self.sessions``, and
            # ``self.session`` does not exist, so any bill with relations
            # raised AttributeError.
            related_bill_session = self.sessions(
                self.toTime(related_intro_date))
            bill.add_related_bill(
                identifier=related_bill['MatterFile'],
                legislative_session=related_bill_session,
                relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # Legistar text sometimes contains NUL characters, which the
        # datastore rejects; strip them.
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                u'\u0000', '')

    return bill