def scrape(self, session=None, chambers=None):
    """Scrape Ohio bills and resolutions for *session*.

    Generator: yields Bill objects and (via process_vote) VoteEvent
    objects.  Sessions before the 128th have no data; sessions before
    the 131st use the legacy scraper; later sessions use the
    undocumented solarapi JSON API.
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info("no session, using %s", session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))

    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)

    else:
        # maps API chamber strings (several capitalizations) -> OCD chamber
        chamber_dict = {
            "Senate": "upper",
            "House": "lower",
            "House of Representatives": "lower",
            "house": "lower",
            "senate": "upper",
        }
        # so presumanbly not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {
            "approved": True,
            "passed": True,
            "adopted": True,
            "true": True,
            "false": False,
            "failed": False,
            True: True,
            False: False,
        }
        # maps API action codes -> OCD action classification
        # (None = known code with no classification; a list applies several)
        action_dict = {
            "ref_ctte_100": "referral-committee",
            "intro_100": "introduction",
            "intro_101": "introduction",
            "pass_300": "passage",
            "intro_110": "reading-1",
            "refer_210": "referral-committee",
            "crpt_301": None,
            "crpt_317": None,
            "concur_606": "passage",
            "pass_301": "passage",
            "refer_220": "referral-committee",
            "intro_102": ["introduction", "passage"],
            "intro_105": ["introduction", "passage"],
            "intro_ref_ctte_100": "referral-committee",
            "refer_209": None,
            "intro_108": ["introduction", "passage"],
            "intro_103": ["introduction", "passage"],
            "msg_reso_503": "passage",
            "intro_107": ["introduction", "passage"],
            "imm_consid_360": "passage",
            "refer_213": None,
            "adopt_reso_100": "passage",
            "adopt_reso_110": "passage",
            "msg_507": "amendment-passage",
            "confer_713": None,
            "concur_603": None,
            "confer_712": None,
            "msg_506": "amendment-failure",
            "receive_message_100": "passage",
            "motion_920": None,
            "concur_611": None,
            "confer_735": None,
            "third_429": None,
            "final_501": None,
            "concur_608": None,
            "infpass_217": "passage",
        }

        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(
            session=session)
        legislators = self.get_legislator_ids(first_page)
        # bulk-fetch per-document-type indexes once, attached per bill below
        all_amendments = self.get_other_data_source(
            first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url,
                                                 "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url,
                                                  "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url,
                                                  "analysiss")

        for row in self.get_bill_rows(session):
            (
                spacer,
                number_link,
                _ga,
                title,
                primary_sponsor,
                status,
                spacer,
            ) = row.xpath("td")

            # S.R.No.1 -> SR1
            bill_id = number_link.text_content().replace("No.", "")
            bill_id = bill_id.replace(".", "").replace(" ", "")
            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

            title = title.text_content().strip()
            title = re.sub(r"^Title", "", title)

            chamber = "lower" if "H" in bill_id else "upper"
            classification = "bill" if "B" in bill_id else "resolution"

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=classification,
            )
            bill.add_source(number_link.xpath("a/@href")[0])

            # get bill from API
            bill_api_url = (
                "http://search-prod.lis.state.oh.us/solarapi/v1/"
                "general_assembly_{}/{}/{}/".format(
                    session,
                    "bills" if "B" in bill_id else "resolutions",
                    bill_id.lower().replace(" ", ""),
                ))
            data = self.get(bill_api_url).json()
            if len(data["items"]) == 0:
                # yield the skeleton bill anyway so the listing row isn't lost
                self.logger.warning(
                    "Data for bill {bill_id} has empty 'items' array,"
                    " cannot process related information".format(
                        bill_id=bill_id.lower().replace(" ", "")))
                yield bill
                continue

            # add title if no short title
            if not bill.title:
                bill.title = data["items"][0]["longtitle"]
            bill.add_title(data["items"][0]["longtitle"], "long title")

            # this stuff is version-specific
            for version in data["items"]:
                version_name = version["version"]
                version_link = base_url + version["pdfDownloadLink"]
                bill.add_version_link(version_name,
                                      version_link,
                                      media_type="application/pdf")

            # we'll use latest bill_version for everything else
            bill_version = data["items"][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

            # actions: 404 on the action feed is treated as "no actions"
            try:
                action_doc = self.get(base_url +
                                      bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                # API lists newest first; reverse for chronological order
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning(
                            "Unknown action {desc} with code {code}."
                            " Add it to the action_dict"
                            ".".format(desc=action_desc,
                                       code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(
                        datetime.datetime.strptime(action["datetime"],
                                                   "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc,
                                    date,
                                    chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill,
                              base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill,
                              base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill,
                              base_url)

            # votes (floor votes, then committee votes)
            vote_url = base_url + bill_version["votes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning(
                    "Vote page not loading; skipping: {}".format(vote_url))
                yield bill
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning(
                    "Vote page not loading; skipping: {}".format(vote_url))
                yield bill
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(
                    data["items"][0]["effective_date"], "%Y-%m-%d")
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action list JSON.
                # It looks like:
                # Effective 7/6/18
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(
                    effective_action,
                    effective_date,
                    chamber="executive",
                    classification=["became-law"],
                )

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url + bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url + bill_version["disapprove"][0][
                    "link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError(
                        "Whoa, a disapprove! We've never"
                        " gotten one before."
                        " Go write some code to deal "
                        "with it: {}".format(disapprove_url))

            yield bill
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """
    Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.

    Generator: yields a single Bill, or nothing if the bill has not yet
    been transmitted to the other chamber.
    """
    # normalize House/Senate names to OCD chamber codes; other values pass through
    chamber = "lower" if chamber.lower() == "house" else chamber
    chamber = "upper" if chamber.lower() == "senate" else chamber

    # Get html and parse
    doc = self.lxmlize(bill_detail_url)

    # Check if bill hasn't been transmitted to the other chamber yet
    transmit_check = self.get_node(
        doc,
        '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()')
    if (transmit_check is not None
            and "has not been transmitted" in transmit_check.strip()):
        self.logger.debug("Bill has not been transmitted to other chamber "
                          "... skipping {0}".format(bill_detail_url))
        return

    # Get the basic parts of the bill
    bill_id = self.get_node(
        doc, '//h1[contains(@class,"card-title float-left mr-4")]/text()')
    self.logger.debug(bill_id)

    # Title: prefer the short "Description" paragraph; fall back to the
    # linked "Long Description" page; finally a placeholder.
    bill_title_text = self.get_node(
        doc,
        '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()')
    if bill_title_text is not None:
        bill_title = bill_title_text.strip()
    else:
        long_desc_url = self.get_node(
            doc, '//a[text()[contains(.,"Long Description")]]/@href')
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(
            long_desc_page, "//h1/"
            "following-sibling::p/text()")
        if long_desc_text is not None:
            bill_title = long_desc_text.strip()
        else:
            bill_title = "No title found."
            self.logger.warning("No title found for {}.".format(bill_id))
    self.logger.debug(bill_title)

    # second character of the bill ID encodes the instrument type
    bill_type = {
        "F": "bill",
        "R": "resolution",
        "C": "concurrent resolution"
    }[bill_id[1].upper()]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    # Add source
    bill.add_source(bill_detail_url)

    for subject in self._subject_mapping[bill_id]:
        bill.add_subject(subject)

    # Get companion bill.
    companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                          '/a[starts-with(@href, "?")]/text()')
    companion = self.make_bill_id(
        companion[0]) if len(companion) > 0 else None
    companion_chamber = self.chamber_from_bill(companion)
    if companion is not None:
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a single North Carolina bill's detail page.

    Generator: yields the Bill plus any VoteEvents produced by the
    vote sub-scrapers.

    Fix: the action-date fallback previously wrapped ``strptime`` in
    ``except KeyError``, but ``strptime`` raises ``ValueError`` on a
    non-date token, so any action text containing a "/" that wasn't a
    date (e.g. "and/or") crashed the scrape and the intended handler
    was unreachable. Unparseable tokens are now simply skipped; the
    existing ``act_date is not None`` guard below already suppresses
    actions for which no date could be found.
    """
    # there will be a space in bill_id if we're doing a one-off bill scrape
    # convert HB 102 into H102
    if " " in bill_id:
        bill_id = bill_id[0] + bill_id.split(" ")[-1]

    # if chamber comes in as House/Senate convert to lower/upper
    if chamber == "Senate":
        chamber = "upper"
    elif chamber == "House":
        chamber = "lower"

    bill_detail_url = (
        "http://www.ncleg.net/gascripts/"
        "BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all"
    ) % (session, bill_id)

    # parse the bill data page, finding the latest html text
    data = self.get(bill_detail_url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(bill_detail_url)

    # classify from the page heading and re-expand the compressed bill id
    title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
    if "Joint Resolution" in title_div_txt:
        bill_type = "joint resolution"
        bill_id = bill_id[0] + "JR " + bill_id[1:]
    elif "Resolution" in title_div_txt:
        bill_type = "resolution"
        bill_id = bill_id[0] + "R " + bill_id[1:]
    elif "Bill" in title_div_txt:
        bill_type = "bill"
        bill_id = bill_id[0] + "B " + bill_id[1:]

    bill_title = doc.xpath("//main//div[@class='col-12'][1]")[0]
    bill_title = bill_title.text_content().strip()

    # For special cases where bill title is blank, a new title is created using Bill ID
    if not bill_title:
        bill_title = bill_id.replace(" ", "")

    bill = Bill(
        bill_id,
        legislative_session=session,
        title=bill_title,
        chamber=chamber,
        classification=bill_type,
    )
    bill.add_source(bill_detail_url)

    # skip first PDF link (duplicate link to cur version)
    if chamber == "lower":
        link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
    else:
        link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
    for vlink in doc.xpath(link_xpath)[1:]:
        # get the name from the PDF link...
        version_name = vlink.text.replace("\xa0", " ")
        version_url = vlink.attrib["href"]

        media_type = "text/html"
        if version_url.lower().endswith(".pdf"):
            media_type = "application/pdf"
        bill.add_version_link(
            version_name, version_url, media_type=media_type, on_duplicate="ignore"
        )

    # rows with a 'adopted' in the text and an amendment link, skip failed amds
    for row in doc.xpath(
        '//div[@class="card-body"]/div[contains(., "Adopted")'
        ' and contains(@class,"row")]//a[@title="Amendment"]'
    ):
        version_url = row.xpath("@href")[0]
        version_name = row.xpath("string(.)").strip()
        bill.add_version_link(
            version_name,
            version_url,
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    # sponsors
    spon_row = doc.xpath(
        '//div[contains(text(), "Sponsors")]/following-sibling::div'
    )[0]
    # first sponsors are primary, until we see (Primary)
    spon_type = "primary"
    spon_lines = spon_row.text_content().replace("\r\n", ";").replace("\n", ";")
    for leg in spon_lines.split(";"):
        name = leg.replace("\xa0", " ").strip()
        if name.startswith("(Primary)") or name.endswith("(Primary)"):
            # the "(Primary)" marker closes the primary group; everyone
            # after this entry is a cosponsor
            name = name.replace("(Primary)", "").strip()
            spon_type = "cosponsor"
        if not name:
            continue
        bill.add_sponsorship(
            name,
            classification=spon_type,
            entity_type="person",
            primary=(spon_type == "primary"),
        )

    # keywords
    kw_row = doc.xpath(
        '//div[contains(text(), "Keywords:")]/following-sibling::div'
    )[0]
    for subject in kw_row.text_content().split(", "):
        bill.add_subject(subject)

    # actions
    action_tr_xpath = (
        '//h6[contains(text(), "History")]'
        '/ancestor::div[contains(@class, "gray-card")]'
        '//div[contains(@class, "card-body")]'
        '/div[@class="row"]'
    )
    # skip two header rows
    for row in doc.xpath(action_tr_xpath):
        cols = row.xpath("div")
        act_date = cols[1].text
        actor = cols[3].text or ""
        # if text is blank, try diving in
        action = (cols[5].text or "").strip() or cols[5].text_content().strip()

        if act_date is None:
            # no date column; look for a m/d/Y token inside the action text
            for act in action.split():
                if "/" not in act:
                    continue
                try:
                    act_date = dt.datetime.strptime(act, "%m/%d/%Y").strftime(
                        "%Y-%m-%d"
                    )
                except ValueError:
                    # token contains "/" but isn't a date — keep looking
                    continue
        else:
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y").strftime(
                "%Y-%m-%d"
            )

        if actor == "Senate":
            actor = "upper"
        elif actor == "House":
            actor = "lower"
        else:
            actor = "executive"

        # first matching prefix classifier wins; otherwise unclassified
        for pattern, atype in self._action_classifiers.items():
            if action.startswith(pattern):
                break
        else:
            atype = None

        # actions with no discoverable date are dropped
        if act_date is not None:
            bill.add_action(action, act_date, chamber=actor, classification=atype)

    # TODO: Fix vote scraper
    for row in doc.xpath("//h6[@id='vote-header']"):
        yield from self.scrape_votes(bill, doc)

    # For archived votes
    if session in ["1997", "1999"]:
        yield from self.add_archived_votes(bill, bill_id)

    yield bill
def scrape(self, session=None):
    """Scrape DC Council legislation for *session* via the LIMS API.

    Generator: yields Bill and VoteEvent objects.

    Fixes:
    - The cosponsor loop previously read ``i["memberName"]`` — the loop
      variable left over from the *introducers* loop — so every
      cosponsor was recorded under the last introducer's name. It now
      reads the cosponsor record itself.
    - Cosponsors are now recorded with ``primary=False``, matching
      their "cosponsor" classification (introducers remain primary).
    - ``doc_type`` is initialized per attachment so a non-version,
      non-amendment attachment no longer hits an unbound (or stale)
      ``doc_type`` when adding a document link.
    """
    for category in self._categories:
        leg_listing_url = (self._API_BASE_URL +
                           f"BulkData/{category['categoryId']}/{session}")
        resp = self.post(leg_listing_url, headers=self._headers, verify=False)
        resp.raise_for_status()
        leg_listing = resp.json()

        for leg in leg_listing:
            bill = Bill(
                leg["legislationNumber"],
                legislative_session=session,
                title=leg["title"],
                classification=category["name"],
            )
            bill.add_source(leg_listing_url)
            bill_url = (
                f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
            )
            bill.add_source(bill_url)

            if leg["lawNumber"]:
                bill.extras["lawNumber"] = leg["lawNumber"]

            # Actions
            for hist in leg["legislationHistory"]:
                hist_date = datetime.datetime.strptime(
                    hist["actionDate"], "%b %d, %Y")
                hist_date = self._TZ.localize(hist_date)
                hist_action = hist["actionDescription"]
                # strip the leading "Other" from OtherAmendment/OtherMotion
                if hist_action.split()[0] in [
                        "OtherAmendment", "OtherMotion"
                ]:
                    hist_action = hist_action[5:]
                hist_class = self.classify_action(hist_action)

                if "mayor" in hist_action.lower():
                    actor = "executive"
                else:
                    actor = "legislature"
                bill.add_action(hist_action,
                                hist_date,
                                classification=hist_class,
                                chamber=actor)

                # Documents with download links
                if hist["downloadURL"] and ("download" in hist["downloadURL"]):
                    download = hist["downloadURL"]
                    if not download.startswith("http"):
                        download = "https://lims.dccouncil.us/" + download

                    mimetype = ("application/pdf"
                                if download.endswith("pdf") else None)

                    is_version = False
                    doc_type = None
                    # figure out if it's a version from type/name
                    possible_version_types = [
                        "SignedAct",
                        "Introduction",
                        "Enrollment",
                        "Engrossment",
                    ]
                    for vt in possible_version_types:
                        if vt.lower() in download.lower():
                            is_version = True
                            doc_type = vt
                    if "amendment" in download.lower():
                        doc_type = "Amendment"

                    if is_version:
                        bill.add_version_link(
                            doc_type,
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        bill.add_document_link(
                            hist["actionDescription"],
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            # Grabs Legislation details
            leg_details_url = (
                self._API_BASE_URL +
                f"LegislationDetails/{leg['legislationNumber']}")
            details_resp = self.get(leg_details_url,
                                    headers=self._headers,
                                    verify=False)
            details_resp.raise_for_status()
            leg_details = details_resp.json()

            # Sponsors
            for i in leg_details["introducers"]:
                name = i["memberName"]
                bill.add_sponsorship(
                    name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # Co-sponsor
            if leg_details["coSponsors"]:
                for cs in leg_details["coSponsors"]:
                    # BUGFIX: was i["memberName"] (stale introducers loop var)
                    name = cs["memberName"]
                    bill.add_sponsorship(
                        name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

            # Committee Hearing Doc
            for commHearing in leg_details["committeeHearing"]:
                if commHearing["hearingRecord"]:
                    bill.add_document_link(
                        commHearing["hearingType"],
                        commHearing["hearingRecord"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            for committeeMarkup in leg_details["committeeMarkup"]:
                if committeeMarkup["committeeReport"]:
                    bill.add_document_link(
                        "Committee Markup",
                        committeeMarkup["committeeReport"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            # Actions and Votes
            if leg_details["actions"]:
                # To prevent duplicate votes
                vote_ids = []
                for act in leg_details["actions"]:
                    action_name = act["action"]
                    action_date = datetime.datetime.strptime(
                        act["actionDate"][:10], "%Y-%m-%d")
                    action_date = self._TZ.localize(action_date)

                    if action_name.split()[0] == "Other":
                        action_name = " ".join(action_name.split()[1:])

                    if "mayor" in action_name.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"

                    # Documents and Versions
                    if act["attachment"]:
                        mimetype = ("application/pdf"
                                    if act["attachment"].endswith("pdf") else
                                    None)

                        is_version = False
                        # default document label when no version type matches
                        # (previously doc_type could be unbound here)
                        doc_type = action_name
                        # figure out if it's a version from type/name
                        possible_version_types = [
                            "SignedAct",
                            "Introduction",
                            "Enrollment",
                            "Engrossment",
                        ]
                        for vt in possible_version_types:
                            if vt.lower() in act["attachment"].lower():
                                is_version = True
                                doc_type = vt
                        if "amendment" in act["attachment"].lower():
                            doc_type = "Amendment"

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                    # Votes
                    if act["voteDetails"]:
                        result = act["voteDetails"]["voteResult"]
                        if result:
                            status = self._vote_statuses[result.lower()]
                            id_text = (str(leg["legislationNumber"]) + "-" +
                                       action_name + "-" + result)
                            if id_text not in vote_ids:
                                vote_ids.append(id_text)
                                action_class = self.classify_action(
                                    action_name)
                                v = VoteEvent(
                                    identifier=id_text,
                                    chamber=actor,
                                    start_date=action_date,
                                    motion_text=action_name,
                                    result=status,
                                    classification=action_class,
                                    bill=bill,
                                )
                                v.add_source(leg_listing_url)

                                yes_count = (
                                    no_count
                                ) = absent_count = abstain_count = other_count = 0
                                for leg_vote in act["voteDetails"]["votes"]:
                                    mem_name = leg_vote["councilMember"]
                                    if leg_vote["vote"] == "Yes":
                                        yes_count += 1
                                        v.yes(mem_name)
                                    elif leg_vote["vote"] == "No":
                                        no_count += 1
                                        v.no(mem_name)
                                    elif leg_vote["vote"] == "Absent":
                                        absent_count += 1
                                        v.vote("absent", mem_name)
                                    elif leg_vote["vote"] == "Recused":
                                        v.vote("abstain", mem_name)
                                        abstain_count += 1
                                    elif leg_vote["vote"] == "Present":
                                        v.vote("other", mem_name)
                                        other_count += 1
                                    else:
                                        # Incase anything new pops up
                                        other_count += 1
                                        v.vote("other", mem_name)

                                v.set_count("yes", yes_count)
                                v.set_count("no", no_count)
                                v.set_count("absent", absent_count)
                                v.set_count("abstain", abstain_count)
                                v.set_count("other", other_count)
                                yield v

            yield bill
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.

    Generator: yields the Bill plus any VoteEvents from parse_vote.
    """
    url = BILL_URL % (session, bill_id.replace(" ", ""))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute(
        "https://legislature.idaho.gov/legislation/%s/" % session
    )
    # bill_tables: [0] sponsors, [1] title, [2] action history
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(
        legislative_session=session,
        chamber=chamber,
        identifier=bill_id,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    for subject in self._subjects[bill_id.replace(" ", "")]:
        bill.add_subject(subject)

    if short_title and title.lower() != short_title.lower():
        bill.add_title(short_title, "short title")

    # documents
    doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get("href")
        if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
            bill.add_version_link(note=name, url=href, media_type="application/pdf")
        else:
            bill.add_document_link(
                note=name, url=href, media_type="application/pdf"
            )

    def _split(string):
        # split a sponsor list on "<word>," or "<word>AND" separators
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split("by")
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if "COMMITTEE" in sponsors.upper():
                bill.add_sponsorship(
                    name=sponsors.strip(),
                    entity_type="organization",
                    primary=True,
                    classification="primary",
                )
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(
                            classification="primary",
                            name=person,
                            entity_type="person",
                            primary=True,
                        )

    actor = chamber
    last_date = None
    # if a bill has passed a chamber or been 'received from'
    # then the next committee passage is in the opposite chamber
    has_moved_chambers = False
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]

        # rows with a blank date inherit the previous row's date
        if date:
            last_date = date
        else:
            date = last_date
        # the table only shows m/d; the year comes from the session string
        date = datetime.datetime.strptime(
            date + "/" + session[0:4], "%m/%d/%Y"
        ).strftime("%Y-%m-%d")
        if action.startswith("House"):
            actor = "lower"
        elif action.startswith("Senate"):
            actor = "upper"

        # votes
        if "AYES" in action or "NAYS" in action:
            yield from self.parse_vote(
                actor, date, row[2], session, bill_id, chamber, url
            )
            # bill.add_vote_event(vote)
        # some td's text is seperated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace(u"\xa0", " ").strip()
        atype = get_action(actor, action)
        if atype and "passage" in atype:
            has_moved_chambers = True

        if atype and "committee-passage" in atype and has_moved_chambers:
            actor = _OTHER_CHAMBERS[actor]

        bill.add_action(action, date, chamber=actor, classification=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if "to House" in action:
            actor = "lower"
        elif "to Senate" in action:
            actor = "upper"

    yield bill
def scrape_chamber(self, chamber, session):
    """Scrape one NH chamber's bills for *session* from the pipe-delimited
    data dumps (LSRs, legislators, sponsors, docket).

    Generator: yields VoteEvents (via scrape_votes) then all Bills.
    """
    if int(session) < 2017:
        legacy = NHLegacyBillScraper(self.metadata, self.datadir)
        yield from legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly incremented,
        # even though it saves fine. So fake the output_names
        self.output_names = ["1"]
        return

    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    self.versions_by_lsr = {}  # mapping of bill ID to lsr
    self.amendments_by_lsr = {}

    # pre load the mapping table of LSR -> version id
    self.scrape_version_ids()
    self.scrape_amendments()

    last_line = []
    for line in (self.get(
            f"http://www.gencourt.state.nh.us/dynamicdatadump/LSRs.txt?x={self.cachebreaker}"
    ).content.decode("utf-8").split("\n")):
        line = line.split("|")
        if len(line) < 1:
            continue

        # records are 36 fields; shorter lines are continuations that must
        # be stitched onto the previous (also short) line
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # NOTE(review): the check uses line[1:] but the join keeps
                # the full line — looks inconsistent; confirm against data
                line = last_line + line
                self.warning("used bad line")
            else:
                # skip this line, maybe we'll use it later
                self.warning("bad line: %s" % "|".join(line))
                last_line = line
                continue
        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        # type_num = line[4]
        expanded_bill_id = line[9]
        bill_id = line[10]

        if body == body_code[chamber] and session_yr == session:
            # classify from the expanded bill id prefix
            if expanded_bill_id.startswith("CACR"):
                bill_type = "constitutional amendment"
            elif expanded_bill_id.startswith("PET"):
                bill_type = "petition"
            elif expanded_bill_id.startswith("AR") and bill_id.startswith(
                    "CACR"):
                bill_type = "constitutional amendment"
            elif expanded_bill_id.startswith(
                    "SSSB") or expanded_bill_id.startswith("SSHB"):
                # special session house/senate bills
                bill_type = "bill"
            else:
                bill_type = bill_type_map[expanded_bill_id.split(" ")[0]
                                          [1:]]

            # drop a leading parenthesized prefix from the title
            if title.startswith("("):
                title = title.split(")", 1)[1].strip()

            self.bills[lsr] = Bill(
                legislative_session=session,
                chamber=chamber,
                identifier=bill_id,
                title=title,
                classification=bill_type,
            )

            # check to see if resolution, process versions by getting lsr
            # off link on the bill source page
            if re.match(r"^.R\d+", bill_id):
                # ex: HR 1 is lsr=847 but version id=838
                resolution_url = (
                    "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/bill_status.aspx?"
                    + "lsr={}&sy={}&txtsessionyear={}".format(
                        lsr, session, session))
                resolution_page = self.get(
                    resolution_url,
                    allow_redirects=True).content.decode("utf-8")
                page = lxml.html.fromstring(resolution_page)
                version_href = page.xpath("//a[2]/@href")[1]
                true_version = re.search(r"id=(\d+)&", version_href)[1]
                self.versions_by_lsr[lsr] = true_version

            # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
            # or if 2022 bills
            # http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/billText.aspx?id=1410&txtFormat=html&sy=2022
            if lsr in self.versions_by_lsr:
                version_id = self.versions_by_lsr[lsr]
                version_url = (
                    "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/"
                    "billText.aspx?sy={}&id={}&txtFormat=html".format(
                        session, version_id))
                pdf_version_url = (
                    "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/"
                    "billText.aspx?sy={}&id={}&txtFormat=pdf&v=current".
                    format(session, version_id))
                latest_version_name = "latest version"
                self.bills[lsr].add_version_link(
                    note=latest_version_name,
                    url=version_url,
                    media_type="text/html",
                )
                self.bills[lsr].add_version_link(
                    note=latest_version_name,
                    url=pdf_version_url,
                    media_type="application/pdf",
                )

            # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
            if lsr in self.amendments_by_lsr:
                amendment_id = self.amendments_by_lsr[lsr]
                amendment_url = (
                    "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/"
                    "billText.aspx?sy={}&id={}&txtFormat=amend".format(
                        session, amendment_id))
                amendment_name = "Amendment #{}".format(amendment_id)
                self.bills[lsr].add_version_link(
                    note=amendment_name,
                    url=amendment_url,
                    media_type="application/pdf",
                )

            self.bills_by_id[bill_id] = self.bills[lsr]

    # load legislators
    self.legislators = {}
    for line in (self.get(
            "http://www.gencourt.state.nh.us/dynamicdatadump/legislators.txt?x={}"
            .format(
                self.cachebreaker)).content.decode("utf-8").split("\n")):
        if len(line) < 2:
            continue

        line = line.split("|")
        # strip the UTF-8 BOM that leads the first record
        employee_num = line[0].replace("\ufeff", "")

        # first, last, middle
        if len(line) > 2:
            name = "%s %s %s" % (line[2], line[3], line[1])
        else:
            name = "%s %s" % (line[2], line[1])

        self.legislators[employee_num] = {"name": name, "seat": line[5]}
        # body = line[4]

    # sponsors
    for line in (self.get(
            f"http://www.gencourt.state.nh.us/dynamicdatadump/LsrSponsors.txt?x={self.cachebreaker}"
    ).content.decode("utf-8").split("\n")):
        if len(line) < 1:
            continue

        session_yr, lsr, _seq, employee, primary = line.strip().split("|")
        lsr = lsr.zfill(4)

        if session_yr == session and lsr in self.bills:
            sp_type = "primary" if primary == "1" else "cosponsor"
            try:
                # Removes extra spaces in names
                sponsor_name = self.legislators[employee]["name"].strip()
                sponsor_name = " ".join(sponsor_name.split())
                self.bills[lsr].add_sponsorship(
                    classification=sp_type,
                    name=sponsor_name,
                    entity_type="person",
                    primary=True if sp_type == "primary" else False,
                )
                # NOTE(review): this overwrites extras per sponsor, so only
                # the last sponsor's seat code survives — confirm intended
                self.bills[lsr].extras = {
                    "_code": self.legislators[employee]["seat"]
                }
            except KeyError:
                self.warning("Error, can't find person %s" % employee)

    # actions
    for line in (self.get(
            f"http://www.gencourt.state.nh.us/dynamicdatadump/Docket.txt?x={self.cachebreaker}"
    ).content.decode("utf-8").split("\n")):
        if len(line) < 1:
            continue
        # a few blank/irregular lines, irritating
        if "|" not in line:
            continue

        (session_yr, lsr, timestamp, bill_id, body, action,
         _) = line.split("|")

        if session_yr == session and lsr in self.bills:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S %p")
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(
                chamber=actor,
                description=action,
                date=time.strftime("%Y-%m-%d"),
                classification=atype,
            )
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document_link(
                    note="amendment %s" % amendment_id,
                    url=AMENDMENT_URL % amendment_id,
                    on_duplicate="ignore",
                )

    yield from self.scrape_votes(session)

    # save all bills
    for bill in self.bills:
        # bill.add_source(zip_url)
        self.add_source(self.bills[bill], bill, session)
        yield self.bills[bill]
def scrape_bill_list(self, url):
    """Scrape every Alabama bill on one ALISON listing page.

    Generator: yields Bills and (via scrape_vote) VoteEvents.
    """
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        (bill_id, ) = bill_info.xpath("td[1]/font/input/@value")
        (sponsor, ) = bill_info.xpath("td[2]/font/input/@value")
        (subject, ) = bill_info.xpath("td[3]//text()")
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title="",
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type="person",
                classification="primary",
                primary=True,
            )
        bill.add_source(url)

        bill_url = ("http://alisondb.legislature.state.al.us/Alison/"
                    "SESSBillStatusResult.aspx?BILL={}".format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning(
                "Bill {} has no webpage, and will be skipped".format(
                    bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        # note: "lblShotTitle" is the actual element id on the state site
        if bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
            title = (bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]')
                [0].text_content().strip())
        # NOTE(review): if the span is absent, `title` carries over from the
        # previous loop iteration (or is unbound on the first) — confirm
        if not title:
            title = "[No title given by state]"
        bill.title = title

        # 2021 first special session uses a different path segment
        session = "2021FS" if self.session == "2021s1" else self.session
        version_url_base = (
            "http://alisondb.legislature.state.al.us/ALISON/"
            "SearchableInstruments/{0}/PrintFiles/{1}-".format(
                session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + "int.pdf"
            elif version == "Engrossed":
                version_url = version_url_base + "eng.pdf"
            elif version == "Enrolled":
                version_url = version_url_base + "enr.pdf"
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))

            bill.add_version_link(
                name,
                version_url,
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath("td[1]")[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath("td[2]/font/text()")[0], self.DATE_FORMAT)
            bir_type = bir.xpath("td[1]/font/text()")[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath("td[3]/font/text()")[0].strip())

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification="other",
            )

            try:
                (bir_vote_id, ) = bir.xpath("td[4]/font/input/@value")
            except ValueError:
                bir_vote_id = ""

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text,
                )

        actions = bill_doc.xpath(
            '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath("td[1]/font/text()")[0].encode(
                    "ascii", "ignore").strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath("td[1]/font/text()")[0], self.DATE_FORMAT)

            (action_chamber, ) = action.xpath("td[2]/font/text()")

            possible_amendment = action.xpath("td[3]/font/u/text()")
            if (len(possible_amendment) > 0
                    and not possible_amendment[0].strip() == ""):
                (amendment, ) = possible_amendment
            else:
                amendment = None

            (action_text, ) = action.xpath("td[4]/font/text()")

            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = (re.search(
                    r".*? referred to the .*? committee on (.*?)$",
                    action_text).group(1).strip())
            except AttributeError:
                action_committee = ""

            if action_date is not None and action_text.strip():
                act = bill.add_action(
                    action_text,
                    TIMEZONE.localize(action_date),
                    chamber=actor,
                    classification=action_type,
                )

                if action_committee:
                    act.add_related_entity(action_committee,
                                           entity_type="organization")

                try:
                    vote_button = action.xpath("td[9]//text()")[0].strip()
                except IndexError:
                    vote_button = ""

                if vote_button.startswith("Roll "):
                    vote_id = vote_button.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=action_chamber,
                        bill_id=bill_id,
                        vote_id=vote_id,
                        vote_date=TIMEZONE.localize(action_date),
                        action_text=action_text,
                    )

                if amendment:
                    session = ("2021FS"
                               if self.session == "2021s1" else self.session)
                    amend_url = (
                        "http://alisondb.legislature.state.al.us/ALISON/"
                        "SearchableInstruments/{0}/PrintFiles/{1}.pdf".format(
                            session, amendment))

                    amend_name = "Amd/Sub {}".format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

        yield bill
def scrape_details(self, bill_detail_url, session, chamber, bill_id):
    """
    Create the Bill from the provided bill_detail_url and yield it.

    Also yields any VoteEvent objects produced while following the
    "View Vote History" link, via scrape_vote_history.

    :param bill_detail_url: URL of the state's bill detail page
    :param session: legislative session identifier
    :param chamber: 'upper' or 'lower'
    :param bill_id: bill identifier, e.g. 'H 3456'
    :return: generator of Bill (and, indirectly, VoteEvent) objects
    """
    page = self.get(bill_detail_url).text

    # The site serves a valid page with this marker for bad bill numbers.
    if "INVALID BILL NUMBER" in page:
        self.warning("INVALID BILL %s" % bill_detail_url)
        return

    doc = lxml.html.fromstring(page)
    doc.make_links_absolute(bill_detail_url)

    bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

    bill_type = bill_div.xpath("span/text()")[0]

    # Map the state's human-readable type label to our classification.
    # NOTE: "Resolution" must be checked last, since the more specific
    # labels also contain the word "Resolution".
    if "General Bill" in bill_type:
        bill_type = "bill"
    elif "Concurrent Resolution" in bill_type:
        bill_type = "concurrent resolution"
    elif "Joint Resolution" in bill_type:
        bill_type = "joint resolution"
    elif "Resolution" in bill_type:
        bill_type = "resolution"
    else:
        raise ValueError("unknown bill type: %s" % bill_type)

    # this is fragile, but less fragile than it was:
    # the summary is the tail text of the node following the "Summary:" <b>.
    b = bill_div.xpath('./b[text()="Summary:"]')[0]
    bill_summary = b.getnext().tail.strip()

    bill = Bill(
        bill_id,
        legislative_session=
        session,  # session name metadata's `legislative_sessions`
        chamber=chamber,  # 'upper' or 'lower'
        title=bill_summary,
        classification=bill_type,
    )

    # Subjects were collected ahead of time into self._subjects,
    # keyed by bill_id.
    subjects = list(self._subjects[bill_id])

    for subject in subjects:
        bill.add_subject(subject)

    # sponsors: individual legislators link to member.php ...
    for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
        bill.add_sponsorship(
            name=sponsor,
            classification="primary",
            primary=True,
            entity_type="person",
        )
    # ... while committee sponsors link to committee.php.
    for sponsor in doc.xpath(
            '//a[contains(@href, "committee.php")]/text()'):
        sponsor = sponsor.replace(u"\xa0", " ").strip()
        bill.add_sponsorship(
            name=sponsor,
            classification="primary",
            primary=True,
            entity_type="organization",
        )

    # find versions on the separate full-text page
    version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
    version_html = self.get(version_url).text
    version_doc = lxml.html.fromstring(version_html)
    version_doc.make_links_absolute(version_url)
    for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
        # duplicate versions with same date, use first appearance
        bill.add_version_link(
            note=version.
            text,  # Description of the version from the state;
            # eg, 'As introduced', 'Amended', etc.
            url=version.get("href"),
            on_duplicate="ignore",
            media_type="text/html",  # Still a MIME type
        )

    # actions: one table row per action, columns are date/chamber/text
    for row in bill_div.xpath("table/tr"):
        date_td, chamber_td, action_td = row.xpath("td")

        date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")

        # An empty chamber cell (None) means the whole legislature.
        action_chamber = {
            "Senate": "upper",
            "House": "lower",
            None: "legislature"
        }[chamber_td.text]

        action = action_td.text_content()
        # Strip trailing journal citations from the action text.
        action = action.split("(House Journal")[0]
        action = action.split("(Senate Journal")[0].strip()

        atype = action_type(action)

        bill.add_action(
            description=action,  # Action description, from the state
            date=date.strftime("%Y-%m-%d"),  # `YYYY-MM-DD` format
            chamber=action_chamber,  # 'upper' or 'lower'
            classification=atype,  # Options explained in the next section
        )

    # votes
    vurl = doc.xpath('//a[text()="View Vote History"]/@href')
    if vurl:
        vurl = vurl[0]
        yield from self.scrape_vote_history(bill, vurl)

    bill.add_source(bill_detail_url)

    yield bill
def scrape(self, session=None):
    """
    Scrape Virginia bills for the given session via the LIS CSV dumps.

    All data files (members, sponsors, amendments, history, summaries,
    votes, bills) are preloaded into self._* dicts by the load_* helpers,
    then each bill is assembled from those caches. Yields VoteEvent
    objects for roll calls and a Bill object per bill.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["identifier"]
        self.info("no session specified, using %s", session)

    # First character of a bill_id encodes its chamber/origin.
    chamber_types = {
        "H": "lower",
        "S": "upper",
        "G": "executive",
        "C": "legislature",
    }
    session_id = SESSION_SITE_IDS[session]
    self._url_base += session_id + "/"
    bill_url_base = "https://lis.virginia.gov/cgi-bin/"

    # Preload every data file into in-memory caches keyed by bill id.
    self.load_members()
    self.load_sponsors()
    self.load_amendments()
    self.load_history()
    self.load_summaries()
    self.load_votes()
    self.load_bills()

    for bill in self._bills:
        bill = self._bills[bill][0]

        bill_id = bill["bill_id"]
        chamber = chamber_types[bill_id[0]]
        # Second character encodes the bill type.
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        b = Bill(
            bill_id,
            session,
            bill["bill_description"],
            chamber=chamber,
            classification=bill_type,
        )
        bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
        b.add_source(bill_url)

        # Long Bill ID needs to have 6 characters to work with vote urls,
        # sponsors, and summaries.
        # Fill in blanks with 0s
        long_bill_id = bill_id
        if len(bill_id) == 3:
            long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
        elif len(bill_id) == 4:
            long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
        elif len(bill_id) == 5:
            long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]

        # Sponsors
        for spon in self._sponsors[long_bill_id]:
            sponsor_type = spon["patron_type"]
            if sponsor_type.endswith("Chief Patron"):
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            b.add_sponsorship(
                spon["member_name"],
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )

        # Summary
        summary_texts = self._summaries[long_bill_id]
        for sum_text in summary_texts:
            b.add_abstract(sum_text["summary_text"],
                           sum_text["summary_type"])

        # Amendment docs
        amendments = self._amendments[bill_id]
        for amend in amendments:
            doc_link = (
                bill_url_base +
                f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
            b.add_document_link("Amendment: " + amend["txt_docid"],
                                doc_link,
                                media_type="text/html")

        # Action text is used to improve version text
        actions_text = []

        # History and then votes
        for hist in self._history[bill_id]:
            action = hist["history_description"]
            action_date = hist["history_date"]
            date = datetime.datetime.strptime(action_date,
                                              "%m/%d/%y").date()
            # History entries are prefixed "H " / "S " etc.; the prefix
            # names the acting chamber and is stripped from the text.
            chamber = chamber_types[action[0]]
            vote_id = hist["history_refid"]
            cleaned_action = action[2:]
            actions_text.append(cleaned_action)

            # categorize actions: first matching classifier wins;
            # the for/else leaves atype = None when nothing matches.
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, cleaned_action):
                    break
            else:
                atype = None

            if atype != SKIP:
                b.add_action(cleaned_action,
                             date,
                             chamber=chamber,
                             classification=atype)

            # A non-empty history_refid means this action has a roll call.
            if len(vote_id) > 0:
                total_yes = 0
                total_no = 0
                total_not_voting = 0
                total_abstain = 0
                for v in self._votes[vote_id]:
                    if v["vote_result"] == "yes":
                        total_yes += 1
                    elif v["vote_result"] == "no":
                        total_no += 1
                    elif v["vote_result"] == "not voting":
                        total_not_voting += 1
                    elif v["vote_result"] == "abstain":
                        total_abstain += 1

                vote = VoteEvent(
                    identifier=vote_id,
                    start_date=date,
                    chamber=chamber,
                    motion_text=cleaned_action,
                    result="pass" if total_yes > total_no else "fail",
                    classification="passage",
                    bill=b,
                )
                vote.set_count("yes", total_yes)
                vote.set_count("no", total_no)
                vote.set_count("not voting", total_not_voting)
                vote.set_count("abstain", total_abstain)
                vote_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                )
                vote.add_source(vote_url)
                for v in self._votes[vote_id]:
                    vote.vote(v["vote_result"], v["member_id"])
                yield vote

        # Versions
        for version in bill["text_docs"]:
            # Checks if abbr is blank as not every bill has multiple versions
            if len(version["doc_abbr"]) > 0:
                version_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                version_date = datetime.datetime.strptime(
                    version["doc_date"], "%m/%d/%y").date()
                version_text = version["doc_abbr"]
                # Prefer the full action text mentioning this abbreviation
                # over the bare abbreviation as the version note.
                for act in actions_text:
                    if version_text in act:
                        version_text = act
                b.add_version_link(
                    version_text,
                    version_url,
                    date=version_date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
        yield b
def test_set_bill_obj():
    """set_bill() given a Bill object should store that bill's internal id."""
    vote_event = toy_vote_event()
    bill = Bill("HB 1", legislative_session="2009", title="fake bill")
    vote_event.set_bill(bill)
    assert vote_event.bill == bill._id
def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
    """
    Scrape a single Louisiana bill page.

    Follows the Authors/Digests/Text/Amendments sub-pages, then parses the
    action table and classifies each action via keyword flags. Yields any
    VoteEvent objects found via the Votes link, then the Bill itself.
    """
    page = self.lxmlize(bill_url)
    author = self.get_one_xpath(
        page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

    # Follow the named link on this page and scrape the bare page behind it.
    def sbp(x):
        return self.scrape_bare_page(
            page.xpath("//a[contains(text(), '%s')]" %
                       (x))[0].attrib["href"]
        )

    authors = [x.text for x in sbp("Authors")]

    # Each of these sub-pages may be absent; a missing link raises
    # IndexError from the xpath()[0] inside sbp.
    try:
        digests = sbp("Digests")
    except IndexError:
        digests = []

    try:
        versions = sbp("Text")
    except IndexError:
        versions = []

    try:
        amendments = sbp("Amendments")
    except IndexError:
        amendments = []

    title = page.xpath(
        "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
    title = title.replace("\u00a0\u00a0", " ")
    actions = page.xpath(
        "//div[@id='ctl00_PageBody_PanelBillInfo']/"
        "/table[@style='font-size:small']/tr"
    )

    bill_id = page.xpath(
        "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

    # bill_abbreviation's first char is the chamber letter; the rest keys
    # the bill-type map.
    bill_type = self._bill_types[bill_abbreviation[1:]]
    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # The primary author also appears in the Authors list; remove the
    # duplicate before adding cosponsors.
    authors.remove(author)
    bill.add_sponsorship(
        author, classification="primary", entity_type="person",
        primary=True
    )
    for author in authors:
        bill.add_sponsorship(
            author, classification="cosponsor", entity_type="person",
            primary=False
        )

    for digest in digests:
        bill.add_document_link(
            note=digest.text,
            url=digest.attrib["href"],
            media_type="application/pdf",
        )

    for version in versions:
        bill.add_version_link(
            note=version.text,
            url=version.attrib["href"],
            media_type="application/pdf",
        )

    for amendment in amendments:
        if "href" in amendment.attrib:
            bill.add_version_link(
                note=amendment.text,
                url=amendment.attrib["href"],
                media_type="application/pdf",
            )

    # Substring -> classification(s); matched case-insensitively below.
    flags = {
        "prefiled": ["filing"],
        "referred to the committee": ["referral-committee"],
        "sent to the house": ["passage"],
        "ordered returned to the house": ["passage"],
        "ordered to the senate": ["passage"],
        "signed by the governor": ["executive-signature"],
        "sent to the governor": ["executive-receipt"],
        "becomes Act": ["became-law"],
        "vetoed by the governor": ["executive-veto"],
    }

    try:
        votes_link = page.xpath("//a[text() = 'Votes']")[0]
        yield from self.scrape_votes(bill, votes_link.attrib["href"])
    except IndexError:
        # Some bills don't have any votes
        pass

    for action in actions:
        # NOTE(review): this rebinds `page` (the lxml doc) to a cell's
        # text; harmless because the doc is not used again below, but
        # worth renaming if this loop is ever moved earlier.
        date, chamber, page, text = [x.text for x in action.xpath(".//td")]
        session_year = self.jurisdiction.legislative_sessions[-1][
            "start_date"][0:4]
        # Session is April -> June. Prefiles look like they're in
        # January at earliest.
        date += "/{}".format(session_year)
        date = dt.datetime.strptime(date, "%m/%d/%Y")
        chamber = self._chambers[chamber]

        cat = []
        for flag in flags:
            if flag in text.lower():
                cat += flags[flag]

        bill.add_action(
            description=text,
            date=date.strftime("%Y-%m-%d"),
            chamber=chamber,
            classification=cat,
        )

    yield bill
def test_set_bill_obj_no_extra_args():
    """Passing extra keyword args alongside a Bill object must be rejected."""
    vote_event = toy_vote_event()
    bill = Bill("HB 1", legislative_session="2009", title="fake bill")
    with pytest.raises(ValueError):
        vote_event.set_bill(bill, chamber="lower")
def scrape(self, session=None):
    """
    Scrape Indiana bills for *session* via the state's JSON API.

    Pages through the "bills" endpoint, then for each bill fetches its
    detail and action documents, classifies actions by keyword, and
    yields Bill objects (plus VoteEvents via _process_votes when web
    version scraping is disabled).

    :param session: legislative session identifier passed to the API client
    :return: generator of Bill (and, indirectly, VoteEvent) objects
    """
    self._bill_prefix_map = {
        "HB": {"type": "bill", "url_segment": "bills/house"},
        "HR": {"type": "resolution", "url_segment": "resolutions/house/simple"},
        "HCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "HC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "SB": {"type": "bill", "url_segment": "bills/senate"},
        "SR": {"type": "resolution", "url_segment": "resolutions/senate/simple"},
        "SCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
        "SC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
    }
    api_base_url = "https://api.iga.in.gov"

    # ah, indiana. it's really, really hard to find
    # pdfs in their web interface. Super easy with
    # the api, but a key needs to be passed
    # in the headers. To make these documents
    # viewable to the public and our scrapers,
    # we've put up a proxy service at this link
    # using our api key for pdf document access.

    client = ApiClient(self)
    r = client.get("bills", session=session)
    all_pages = client.unpaginate(r)
    for b in all_pages:
        bill_id = b["billName"]
        disp_bill_id = b["displayName"]

        bill_link = b["link"]
        api_source = api_base_url + bill_link
        try:
            bill_json = client.get(
                "bill", session=session, bill_id=bill_id.lower())
        except scrapelib.HTTPError:
            self.logger.warning("Bill could not be accessed. Skipping.")
            continue

        title = bill_json["description"]
        if title == "NoneNone":
            title = None
        # sometimes description is blank
        # if that's the case, we can check to see if
        # the latest version has a short description
        if not title:
            title = bill_json["latestVersion"]["shortDescription"]

        # and if that doesn't work, use the bill_id but throw a warning
        if not title:
            title = bill_id
            self.logger.warning(
                "Bill is missing a title, using bill id instead.")

        bill_prefix = self._get_bill_id_components(bill_id)[0]

        original_chamber = (
            "lower" if bill_json["originChamber"].lower() == "house"
            else "upper"
        )
        bill_type = self._bill_prefix_map[bill_prefix]["type"]
        bill = Bill(
            disp_bill_id,
            legislative_session=session,
            chamber=original_chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(self._get_bill_url(session, bill_id))
        bill.add_source(api_source)

        # sponsors
        for s in bill_json["authors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="author")
        for s in bill_json["coauthors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
        for s in bill_json["sponsors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
        for s in bill_json["cosponsors"]:
            self._add_sponsor_if_not_blank(
                bill, s, classification="cosponsor")

        # actions
        action_link = bill_json["actions"]["link"]
        api_source = api_base_url + action_link

        try:
            actions = client.get(
                "bill_actions", session=session, bill_id=bill_id.lower()
            )
        except scrapelib.HTTPError:
            self.logger.warning("Could not find bill actions page")
            actions = {"items": []}

        for a in actions["items"]:
            action_desc = a["description"]
            if "governor" in action_desc.lower():
                action_chamber = "executive"
            elif a["chamber"]["name"].lower() == "house":
                action_chamber = "lower"
            else:
                action_chamber = "upper"
            date = a["date"]

            if not date:
                self.logger.warning("Action has no date, skipping")
                continue

            # convert time to pupa fuzzy time
            date = date.replace("T", " ")
            # TODO: if we update pupa to accept datetimes we can drop this line
            date = date.split()[0]

            action_type = []
            d = action_desc.lower()
            committee = None

            reading = False
            if "first reading" in d:
                action_type.append("reading-1")
                reading = True

            if "second reading" in d or "reread second time" in d:
                action_type.append("reading-2")
                reading = True

            if "third reading" in d or "reread third time" in d:
                action_type.append("reading-3")
                if "passed" in d:
                    action_type.append("passage")
                if "failed" in d:
                    action_type.append("failure")
                reading = True

            if "adopted" in d and reading:
                action_type.append("passage")

            if (
                "referred" in d
                and "committee on" in d
                or "reassigned" in d
                and "committee on" in d
            ):
                committee = d.split("committee on")[-1].strip()
                action_type.append("referral-committee")

            if "committee report" in d:
                if "pass" in d:
                    action_type.append("committee-passage")
                if "fail" in d:
                    action_type.append("committee-failure")

            if "amendment" in d and "without amendment" not in d:
                if "pass" in d or "prevail" in d or "adopted" in d:
                    action_type.append("amendment-passage")
                # BUG FIX: this previously read
                # `if "fail" or "out of order" in d:` -- the bare string
                # literal "fail" is always truthy, so every amendment
                # action (including ones just classified as
                # amendment-passage) was also tagged amendment-failure.
                if "fail" in d or "out of order" in d:
                    action_type.append("amendment-failure")
                if "withdraw" in d:
                    action_type.append("amendment-withdrawal")

            if "signed by the governor" in d:
                action_type.append("executive-signature")

            if "vetoed by the governor" in d:
                action_type.append("executive-veto")

            if len(action_type) == 0:
                # calling it other and moving on with a warning
                self.logger.warning(
                    "Could not recognize an action in '{}'".format(
                        action_desc)
                )
                action_type = None

            # renamed from `a` to avoid shadowing the loop variable
            added_action = bill.add_action(
                chamber=action_chamber,
                description=action_desc,
                date=date,
                classification=action_type,
            )

            if committee:
                added_action.add_related_entity(
                    committee, entity_type="organization")

        # subjects
        subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
        for subject in subjects:
            bill.add_subject(subject)

        # Abstract
        if bill_json["latestVersion"]["digest"]:
            bill.add_abstract(
                bill_json["latestVersion"]["digest"], note="Digest")

        # put this behind a flag 2021-03-18 (openstates/issues#291)
        if not SCRAPE_WEB_VERSIONS:
            # votes
            yield from self._process_votes(
                bill_json["latestVersion"]["rollcalls"],
                disp_bill_id,
                original_chamber,
                session,
            )
            # versions
            self.deal_with_version(
                bill_json["latestVersion"], bill, bill_id,
                original_chamber, session
            )
            for version in bill_json["versions"][::-1]:
                self.deal_with_version(
                    version,
                    bill,
                    bill_id,
                    original_chamber,
                    session,
                )
        else:
            self.scrape_web_versions(session, bill, bill_id)

        yield bill
def old_scrape(self, session=None):
    """
    Legacy Ohio scraper for sessions before the 131st General Assembly.

    Downloads the Excel status-report workbooks linked from the status
    reports page and builds a Bill per spreadsheet row; action columns
    start after the title column. Yields votes (via scrape_votes_old)
    and then each Bill.
    """
    status_report_url = (
        "https://www.legislature.ohio.gov/legislation/status-reports")

    # ssl verification off due Ohio not correctly implementing SSL
    if not session:
        session = self.latest_session()
        self.info("no session, using %s", session)

    doc = self.get(status_report_url).text
    doc = lxml.html.fromstring(doc)
    doc.make_links_absolute(status_report_url)
    xpath = "//div[contains(text(),'{}')]/following-sibling::table"
    status_table = doc.xpath(xpath.format(session))[0]
    status_links = status_table.xpath(
        ".//a[contains(text(),'Excel')]/@href")

    for url in status_links:

        try:
            fname, resp = self.urlretrieve(url)
        except scrapelib.HTTPError as report:
            self.logger.warning("Missing report {}".format(report))
            continue

        sh = xlrd.open_workbook(fname).sheet_by_index(0)

        # once workbook is open, we can remove tempfile
        os.remove(fname)
        for rownum in range(1, sh.nrows):
            bill_id = sh.cell(rownum, 0).value

            bill_type = "resolution" if "R" in bill_id else "bill"
            chamber = "lower" if "H" in bill_id else "upper"

            bill_title = str(sh.cell(rownum, 3).value)

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=bill_title,
                classification=bill_type,
            )
            bill.add_source(url)
            # NOTE(review): add_sponsor looks like the pre-pupa Bill API
            # (current Bill uses add_sponsorship) -- confirm this legacy
            # path still runs against a Bill class that supports it.
            bill.add_sponsor("primary", str(sh.cell(rownum, 1).value))

            # add cosponsor
            if sh.cell(rownum, 2).value:
                bill.add_sponsor("cosponsor",
                                 str(sh.cell(rownum, 2).value))

            actor = ""

            # Actions start column after bill title
            for colnum in range(4, sh.ncols - 1):
                # Column headers (row 0) carry the action description;
                # the cell itself holds the action date.
                action = str(sh.cell(0, colnum).value)
                cell = sh.cell(rownum, colnum)
                date = cell.value

                if len(action) != 0:
                    if action.split()[0] == "House":
                        actor = "lower"
                    elif action.split()[0] == "Senate":
                        actor = "upper"
                    elif action.split()[-1] == "Governor":
                        actor = "executive"
                    elif action.split()[0] == "Gov.":
                        actor = "executive"
                    elif action.split()[-1] == "Gov.":
                        actor = "executive"

                if action in ("House Intro. Date", "Senate Intro. Date"):
                    atype = ["bill:introduced"]
                    action = action.replace("Intro. Date", "Introduced")
                elif action == "3rd Consideration":
                    atype = ["bill:reading:3", "bill:passed"]
                elif action == "Sent to Gov.":
                    atype = ["governor:received"]
                elif action == "Signed By Governor":
                    atype = ["governor:signed"]
                else:
                    atype = ["other"]

                # Excel stores dates as floats; a float here means the
                # action actually occurred, so record it.
                if type(date) == float:
                    date = str(xlrd.xldate_as_tuple(date, 0))
                    date = datetime.datetime.strptime(
                        date, "(%Y, %m, %d, %H, %M, %S)")
                    date = self._tz.localize(date)
                    date = "{:%Y-%m-%d}".format(date)
                    bill.add_action(actor, action, date, type=atype)

            # Insert an underscore before the first digit of the bill id,
            # matching the URL scheme used by the old votes/versions pages.
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue

                underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                break
            # NOTE(review): if a bill_id ever contained no digit,
            # underscore_bill would be unbound (or stale from the previous
            # row) here -- presumably ids always contain a number.

            yield from self.scrape_votes_old(bill, underscore_bill, session)
            self.scrape_versions_old(bill, underscore_bill, session)
            yield bill
def scrape_bill(self, chamber, session, url):
    """
    Scrape a single bill detail page and yield a Bill object.

    Delegates subjects, sponsors, and actions to the scrape_bill_*
    helpers; also attaches a fiscal/analysis document and an effective
    date extra when present on the page.
    """
    html = self.get(url).content
    page = lxml.html.fromstring(html)
    page.make_links_absolute(self.BASE_URL)

    # The bill id lives in the styled <h2>, either inside its first link
    # or as bare text.
    if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
        bill_id = page.xpath(
            '//h2[@style="font-size:1.3rem;"]/a[1]/text()')[
            0
        ].strip()
    elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
        bill_id = page.xpath(
            '//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
    else:
        self.warning("No bill id for {}".format(url))
        return
    title = page.xpath(
        '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
    )[0].strip()

    # Classify from the letters in the bill id.
    if "B" in bill_id:
        _type = ["bill"]
    elif "J" in bill_id:
        _type = ["joint resolution"]
    elif "HS" in bill_id or "SS" in bill_id:
        _type = ["resolution"]
    else:
        raise ValueError("unknown bill type " + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=_type,
    )
    bill.add_source(url)

    self.scrape_bill_subjects(bill, page)
    self.scrape_bill_sponsors(bill, page)
    self.scrape_bill_actions(bill, page)

    # fiscal note
    if page.xpath(
            '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'):
        fiscal_note = page.xpath(
            '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
        )[0]
        fiscal_url = fiscal_note.get("href")
        fiscal_title = fiscal_note.text_content()
        bill.add_document_link(
            fiscal_title, fiscal_url, media_type="application/pdf"
        )

    # effective date, where available
    if page.xpath('//div[contains(text(), "Effective Date(s)")]'):
        eff_date = page.xpath(
            '//div[contains(text(), "Effective Date(s)")]/text()'
        )[0].strip()
        eff_date = eff_date.replace("Effective Date(s):", "").strip()
        # this can contain multiple dates, eg "July 1, 2020, July 1, 2022"
        bill.extras["date_effective"] = eff_date

    # yield from self.parse_bill_votes_new(doc, bill)
    yield bill
def scrape_bill(
    self,
    session,
    chamber,
    bill_id,
    title,
    url,
    # Default-bound regex sub: strips a short parenthesized suffix
    # (e.g. district info) from sponsor strings.
    strip_sponsors=re.compile(r"\s*\(.{,50}\)\s*").sub,
):
    """
    Scrape a single West Virginia bill page: versions, amendments,
    sponsors, actions, and house/senate votes. Yields VoteEvents as they
    are found and finally the Bill itself.
    """
    html = self.get(url).text

    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    bill_type = self.bill_types[bill_id.split()[0][1:]]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)

    xpath = ('//strong[contains(., "SUBJECT")]/../'
             "following-sibling::td/a/text()")
    bill.subject = page.xpath(xpath)

    for version in self.scrape_versions(session, chamber, page, bill_id):
        bill.add_version_link(**version)

    self.scrape_amendments(page, bill)

    # Resolution pages have different html.
    # Collect the heading/value pairs from the bill-history table.
    values = {}
    trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
    for tr in trs:
        heading = tr.xpath("td/strong/text()")
        if heading:
            heading = heading[0]
        else:
            continue
        value = tr.text_content().replace(heading, "").strip()
        values[heading] = value

    # summary was always same as title
    # bill['summary'] = values['SUMMARY:']

    # Add primary sponsor.
    primary = strip_sponsors("", values.get("LEAD SPONSOR:", ""))
    if primary:
        bill.add_sponsorship(
            name=primary,
            classification="primary",
            entity_type="person",
            primary=True,
        )

    # Add cosponsors.
    if values.get("SPONSORS:"):
        sponsors = strip_sponsors("", values["SPONSORS:"])
        # Split on commas not followed by a single-initial abbreviation.
        sponsors = re.split(r", (?![A-Z]\.)", sponsors)
        for name in sponsors:
            name = name.strip(", \n\r")
            if name:
                # Fix name splitting bug where "Neale, D. Hall"
                match = re.search(r"(.+?), ([DM]\. Hall)", name)
                if match:
                    for name in match.groups():
                        bill.add_sponsorship(
                            name=name,
                            classification="cosponsor",
                            entity_type="person",
                            primary=False,
                        )
                else:
                    bill.add_sponsorship(
                        name=name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

    for link in page.xpath("//a[contains(@href, 'votes/house')]"):
        yield from self.scrape_house_vote(bill, link.attrib["href"])

    # Actions table, oldest first (rows are reversed).
    for tr in reversed(
            page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
        tds = tr.xpath("td")
        if len(tds) < 3:
            continue

        chamber_letter = tds[0].text_content()
        chamber = {"S": "upper", "H": "lower"}[chamber_letter]

        # Index of date info no longer varies on resolutions.
        date = tds[2].text_content().strip()
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()

        action = tds[1].text_content().strip()
        if action.lower().startswith("passed senate"):
            for href in tds[1].xpath("a/@href"):
                yield from self.scrape_senate_vote(bill, href, date)

        attrs = dict(chamber=chamber,
                     description=action,
                     date=date.strftime("%Y-%m-%d"))
        temp = self.categorizer.categorize(action)
        related_entities = []
        # NOTE(review): `values` here shadows the sponsor dict built
        # above; safe because that dict is no longer used past this point.
        for key, values in temp.items():
            if key != "classification":
                for value in values:
                    related_entities.append({"type": key, "name": value})
        attrs.update(classification=temp["classification"],
                     related_entities=related_entities)
        bill.add_action(**attrs)

    yield bill
def scrape(self, session=None):
    """
    Scrape Vermont bills, resolutions, actions, and roll-call votes via
    the legislature's private JSON endpoints.

    Yields VoteEvent objects per roll call and a Bill per item; bills
    with no internal id (no activity) are yielded without actions/votes.
    """
    # Pattern used to strip embedded HTML markup from status strings.
    HTML_TAGS_RE = r"<.*?>"

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)["data"] or []

    bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)["data"] or [])

    resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
        year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)["data"] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber.
        # Longer prefixes must be tested before their shorter
        # substrings (e.g. "H.C.R." before "H.").
        if info["BillNumber"].startswith("J.R.H."):
            bill_type = "joint resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("J.R.S."):
            bill_type = "joint resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.R."):
            bill_type = "resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.R."):
            bill_type = "resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("PR."):
            bill_type = "constitutional amendment"
            if info["Body"] == "H":
                bill_chamber = "lower"
            elif info["Body"] == "S":
                bill_chamber = "upper"
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info["BillNumber"].startswith("H."):
            bill_type = "bill"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S."):
            bill_type = "bill"
            bill_chamber = "upper"
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info["BillNumber"]))

        bill_id = info["BillNumber"].replace(".", "").replace(" ", "")
        # put one space back in between type and number
        bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info["Title"],
            classification=bill_type,
        )
        if "resolution" in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
            year_slug, info["BillNumber"])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            "following-sibling::dd[1]/ul/li")
        sponsor_type = "primary"
        for sponsor in sponsors:
            # Everything after the "Additional Sponsors" marker row is a
            # cosponsor.
            if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                sponsor_type = "cosponsor"
                continue

            sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                "Rep.", "").replace("Sen.", "").strip())
            # Skip the bare "Less" expander link.
            if sponsor_name and not (sponsor_name[:5] == "Less"
                                     and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type="person",
                    primary=(sponsor_type == "primary"),
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            "following-sibling::dd[1]/ul/li/a |"
            '//ul[@class="bill-path"]//a')
        for version in versions:
            if version.xpath("text()"):
                bill.add_version_link(
                    note=version.xpath("text()")[0],
                    url=version.xpath("@href")[0].replace(" ", "%20"),
                    media_type="application/pdf",
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode("utf-8"),
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(
                info["BillNumber"]))
            yield bill
            continue

        # Capture actions
        actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
            year_slug, internal_bill_id)
        actions_json = self.get(actions_url)
        # Checks if page actually has json posted
        if "json" in actions_json.headers.get("Content-Type"):
            actions = json.loads(actions_json.text)["data"]
            # Checks to see if any data is actually there
            if actions == "":
                continue
        else:
            continue
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v for k, v in action.items() if v is not None}

            if "Signed by Governor" in action["FullStatus"]:
                actor = "executive"
            elif action["ChamberCode"] == "H":
                actor = "lower"
            elif action["ChamberCode"] == "S":
                actor = "upper"
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action["FullStatus"]:
                # assert chambers_passed == set("HS")
                action_type = "executive-signature"
            elif "Vetoed by the Governor" in action["FullStatus"]:
                action_type = "executive-veto"
            elif ("Read first time" in action["FullStatus"]
                  or "Read 1st time" in action["FullStatus"]):
                action_type = "introduction"
            elif "Reported favorably" in action["FullStatus"]:
                action_type = "committee-passage-favorable"
            elif actor == "lower" and any(
                    x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("H")
            elif actor == "upper" and any(
                    x.lower().startswith(" aspassed")
                    or x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("S")
            else:
                action_type = None

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.511
            action["StatusDate"] = action["StatusDate"].replace(
                "/0209", "/2019")

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                date=datetime.datetime.strftime(
                    datetime.datetime.strptime(action["StatusDate"],
                                               "%m/%d/%Y"),
                    "%Y-%m-%d",
                ),
                chamber=actor,
                classification=action_type,
            )

        # Capture votes
        votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)["data"]
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote["VoteHeaderID"]
            roll_call_url = ("http://legislature.vermont.gov/bill/"
                            "loadBillRollCallDetails/{0}/{1}".format(
                                year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)["data"]

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # MemberName is formatted "Name of District".
                (member_name,
                 _district) = member["MemberName"].split(" of ")
                member_name = member_name.strip()

                if member["MemberVote"] == "Yea":
                    roll_call_yea.append(member_name)
                elif member["MemberVote"] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote["FullStatus"]
                    # seems like we've seen both
                    or "Governor overridden" in vote["FullStatus"]
                    or "Governor overriden" in vote["FullStatus"]):
                did_pass = True
            elif ("Failed -- " in vote["FullStatus"]
                  or "Veto of the Governor sustained" in vote["FullStatus"]):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear: " +
                                     vote["FullStatus"])

            # Check vote counts
            yea_count = int(
                re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
            nay_count = int(
                re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

            vote_start_date = datetime.datetime.strftime(
                datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                "%Y-%m-%d",
            )
            motion_text = re.sub(HTML_TAGS_RE, "",
                                 vote["FullStatus"]).strip()
            vote_identifer = (vote["StatusDate"] + "--" + motion_text +
                              "--" + roll_call_url)
            vote_to_add = VoteEvent(
                identifier=vote_identifer,
                bill=bill,
                chamber=("lower" if vote["ChamberCode"] == "H" else "upper"),
                start_date=vote_start_date,
                motion_text=motion_text,
                result="pass" if did_pass else "fail",
                classification="passage",
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)

            vote_to_add.set_count("yes", yea_count)
            vote_to_add.set_count("no", nay_count)
            vote_to_add.set_count("not voting", len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote("not voting", member)

            yield vote_to_add

        # Capture extra information- Not yet implemented
        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        yield bill
def scrape_chamber(self, chamber, session):
    """Scrape all New Mexico bills for one chamber from the Access-DB
    CSV exports (tblSponsors / TblSubjects / Legislation).

    Yields Bill objects after actions and documents have been attached
    by the sibling scrape_* helpers.
    """
    # Bill ids from the DB are prefixed with the chamber letter.
    chamber_letter = "S" if chamber == "upper" else "H"
    bill_type_map = {
        "B": "bill",
        "CR": "concurrent resolution",
        "JM": "joint memorial",
        "JR": "joint resolution",
        "M": "memorial",
        "R": "resolution",
    }

    # used for faking sources
    session_year = session[2:]

    self._init_mdb(session)

    # read in sponsor & subject mappings
    sponsor_map = {}
    for sponsor in self.access_to_csv("tblSponsors"):
        sponsor_map[sponsor["SponsorCode"]] = sponsor["FullName"]
    # McSorley resigned so they removed him from the API
    # but he is still attached to some bills
    # Gonzales switched from being in the House to the Senate
    # but was still showing as a sponsor
    sponsor_map["SMCSO"] = "Cisco McSorley"
    sponsor_map["SGONZ"] = "Roberto J. Gonzales"

    subject_map = {}
    for subject in self.access_to_csv("TblSubjects"):
        subject_map[subject["SubjectCode"]] = subject["Subject"]

    # get all bills into this dict, fill in action/docs before saving
    bills = {}
    for data in [
            row for row in self.access_to_csv("Legislation")
            if row["BillID"].startswith(chamber_letter)
    ]:
        # use their BillID for the key but build our own for storage
        bill_key = data["BillID"].replace(" ", "")
        # remove spaces for consistency
        bill_id = "{}{}{}".format(data["Chamber"], data["LegType"],
                                  data["LegNo"]).replace(" ", "")
        bill_type = bill_type_map[data["LegType"]]
        bills[bill_key] = bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=data["Title"],
            classification=bill_type,
        )

        # fake a source
        data["SessionYear"] = session_year
        # Strip the fields that get interpolated into the fake source URL.
        data.update({
            x: data[x].strip()
            for x in ["Chamber", "LegType", "LegNo", "SessionYear"]
        })

        bill.add_source(
            "http://www.nmlegis.gov/Legislation/Legislation?chamber="
            "{Chamber}&legType={LegType}&legNo={LegNo}"
            "&year={SessionYear}".format(**data))

        # First sponsor column is always the primary sponsor.
        bill.add_sponsorship(
            sponsor_map[data["SponsorCode"]],
            classification="primary",
            entity_type="person",
            primary=True,
        )
        # Up to four additional sponsor columns; placeholder values mean
        # "no sponsor" in the DB export.
        for sponsor_code in [
                "SponsorCode2",
                "SponsorCode3",
                "SponsorCode4",
                "SponsorCode5",
        ]:
            if data[sponsor_code] and data[sponsor_code] not in ("NONE", "X", ""):
                bill.add_sponsorship(
                    sponsor_map[data[sponsor_code]],
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

        # maybe use data['emergency'] data['passed'] data['signed'] as well
        for subject_code in [
                "SubjectCode1", "SubjectCode2", "SubjectCode3"
        ]:
            if data[subject_code]:
                bill.add_subject(subject_map[data[subject_code]])

    # bills and actions come from other tables
    self.scrape_actions(chamber_letter, bills)
    self.scrape_documents(session, "bills", chamber, bills)
    self.scrape_documents(session, "resolutions", chamber, bills)
    self.scrape_documents(session, "memorials", chamber, bills)
    self.check_other_documents(session, chamber, bills)

    yield from bills.values()
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Michigan bill from legislature.mi.gov.

    Tries the first year of the session biennium, then the second; parses
    title, sponsors, actions (including substitutes and roll-call votes),
    versions and analysis documents.

    Yields VoteEvent objects as roll calls are found, then the Bill.

    Fix: in the 2017-2018 tab-split voter-name branch, the inner loop
    previously recorded ``name`` (the whole tab-joined string) instead of
    the per-voter token ``n``.
    """
    # try and get bill for the first year of the session biennium
    url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
        session[:4],
        bill_id.replace(" ", "-"),
    )
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if (
        "Page Not Found" in html
        or "The bill you are looking for is not available yet" in html
    ):
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[-4:],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        if (
            "Page Not Found" in html
            or "The bill you are looking for is not available yet" in html
        ):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute("http://legislature.mi.gov")

    title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[
        0
    ].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(" ")[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber, classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u"\xa0", " ")
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            # The word "primary" in the tail text marks the lead sponsor.
            classification = (
                "primary"
                if sponsor.tail and "primary" in sponsor.tail
                else "cosponsor"
            )
        else:
            classification = "primary"
        bill.add_sponsorship(
            name=name.strip(),
            chamber=chamber,
            entity_type="person",
            primary=classification == "primary",
            classification=classification,
        )

    bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath("td")  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        try:
            date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
        except ValueError:
            self.warning(
                "{} has action with invalid date. Skipping Action".format(bill_id)
            )
            continue
        # instead of trusting upper/lower case, use journal for actor
        actor = "upper" if "SJ" in journal else "lower"

        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor, classification=classification)

        # check if action mentions a sub
        submatch = re.search(
            r"WITH SUBSTITUTE\s+([\w\-\d]+)", action, re.IGNORECASE
        )
        if submatch and tds[2].xpath("a"):
            version_url = tds[2].xpath("a/@href")[0]
            version_name = tds[2].xpath("a/text()")[0].strip()
            version_name = "Substitute {}".format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # NOTE(review): mimetype is only set for .pdf/.htm links; any
            # other extension would reuse a stale value from a previous
            # iteration (or raise UnboundLocalError on the first) — confirm
            # whether other extensions ever occur before tightening this.
            if version_url.lower().endswith(".pdf"):
                mimetype = "application/pdf"
            elif version_url.lower().endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(version_name, version_url, media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath("a/@href")
            if journal_link:
                objectname = journal_link[0].rsplit("=", 1)[-1]
                chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                    session,
                    chamber_name,
                    objectname,
                )
                results = self.parse_roll_call(vote_url, rc_num, session)

                if results is not None:
                    vote_passed = len(results["yes"]) > len(results["no"])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result="pass" if vote_passed else "fail",
                        classification="passage",
                    )

                    # check the expected counts vs actual
                    count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["yes"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d"
                            % (bill_id, action, count, len(results["yes"]))
                        )
                    count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["no"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d"
                            % (bill_id, action, count, len(results["no"]))
                        )

                    vote.set_count("yes", len(results["yes"]))
                    vote.set_count("no", len(results["no"]))
                    vote.set_count("other", len(results["other"]))
                    possible_vote_results = ["yes", "no", "other"]
                    for pvr in possible_vote_results:
                        for name in results[pvr]:
                            if session == "2017-2018":
                                names = name.split("\t")
                                for n in names:
                                    # BUGFIX: record each individual voter
                                    # (was name.strip(), which re-added the
                                    # whole tab-joined string per token).
                                    vote.vote(pvr, n.strip())
                            else:
                                # Prevents voter names like "House Bill No. 4451, entitled" and other sentences
                                if len(name.split()) < 5:
                                    vote.vote(pvr, name.strip())
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" % (bill_id, journal))

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            if url.endswith(".pdf"):
                mimetype = "application/pdf"
            elif url.endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(name, url, media_type=mimetype)

    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def scrape_bill(self, row, session):
    """Build a Bill from one row of Delaware's legislation API, then
    enrich it with sponsors, versions, fiscal notes, actions, amendments
    and votes scraped from the bill's HTML detail page.

    Yields VoteEvent objects (from scrape_votes) followed by the Bill.
    """
    bill_id = row["LegislationDisplayCode"]

    amendment = None
    substitute = None

    # Amended/substituted bills embed the extra token in the display code
    # itself (e.g. "SB 10 w/ SA 1", "SS 1 for SB 10"); peel those off so
    # bill_id is the bare identifier.
    if bill_id.count(" ") > 1:
        if " w/ " in bill_id:
            self.info("Found amended bill `{}`".format(bill_id))
            bill_id, amendment = bill_id.split(" w/ ")
        if " -" in bill_id:
            self.info("Found amended bill `{}`".format(bill_id))
            bill_id, amendment = bill_id.split(" -")
        # A bill can _both_ be amended and be substituted
        if " for " in bill_id:
            self.info(
                "Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(" for ")
        if amendment is None and substitute is None:
            raise ValueError("unknown bill_id format: " + bill_id)

    bill_type = self.classify_bill(bill_id)
    chamber = "upper" if bill_id.startswith("S") else "lower"
    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        chamber=chamber,
        title=row["LongTitle"],
        classification=bill_type,
    )
    if row["Synopsis"]:
        bill.add_abstract(row["Synopsis"], "synopsis")
    if row["ShortTitle"]:
        bill.add_title(row["ShortTitle"], "short title")
    if row["SponsorPersonId"]:
        self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"], "primary")
    # Preserve the stripped-off tokens for downstream consumers.
    if substitute:
        bill.extras["substitute"] = substitute
    if amendment:
        bill.extras["amendment"] = amendment

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
        row["LegislationId"])
    bill.add_source(html_url, note="text/html")
    html = self.lxmlize(html_url)

    # Sponsor links carry the person id in their URL query string.
    additional_sponsors = html.xpath(
        '//label[text()="Additional Sponsor(s):"]'
        "/following-sibling::div/a/@href")
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace(
            "https://legis.delaware.gov/LegislatorDetail?"
            "personId=", "")
        self.add_sponsor_by_legislator_id(bill, sponsor_id, "primary")

    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            "following-sibling::div/a/@href")
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace(
            "https://legis.delaware.gov/LegislatorDetail?"
            "personId=", "")
        self.add_sponsor_by_legislator_id(bill, sponsor_id, "cosponsor")

    versions = html.xpath(
        '//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = "Bill Text"
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row["LegislationId"])

    if row["HasAmendments"] is True:
        self.scrape_amendments(bill, row["LegislationId"])

    yield from self.scrape_votes(bill, row["LegislationId"], session)

    yield bill
def scrape_chamber(self, chamber, session):
    """Scrape all bills for one chamber from the Kansas bill-status API.

    Yields scrape_html results (versions) followed by each Bill.
    """
    # Pull the session metadata so we can get the
    # slug for the API Request
    meta = next(each for each in self.jurisdiction.legislative_sessions
                if each["identifier"] == session)

    if meta["classification"] == "special":
        list_slug = self.special_slugs[session]
    else:
        list_slug = "li"

    list_url = "http://www.kslegislature.org/{}" "/api/v11/rev-1/bill_status"
    list_url = list_url.format(list_slug)

    # Bill numbers are prefixed by the chamber letter (S/H).
    chamber_name = "Senate" if chamber == "upper" else "House"
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(list_url).text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json["content"]

    # there are duplicates
    seen_ids = set()
    for bill_data in bills:

        bill_id = bill_data["BILLNO"]

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # filter duplicates
        if bill_id in seen_ids:
            continue
        seen_ids.add(bill_id)

        # NOTE(review): btype stays unset (NameError below) if the id has
        # none of CR/R/B — presumably the API only emits those forms; confirm.
        if "CR" in bill_id:
            btype = "concurrent resolution"
        elif "R" in bill_id:
            btype = "resolution"
        elif "B" in bill_id:
            btype = "bill"

        title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

        # main
        bill = Bill(bill_id, session, title, chamber=chamber, classification=btype)
        bill.extras = {"status": bill_data["STATUS"]}

        bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

        if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
            bill.add_title(bill_data["LONGTITLE"])

        # An "original sponsor" is the API's expression of "primary sponsor"
        for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
            primary_sponsor = self.clean_sponsor_name(primary_sponsor)
            bill.add_sponsorship(
                name=primary_sponsor,
                entity_type="organization"
                if "committee" in primary_sponsor.lower() else "person",
                primary=True,
                classification="original sponsor",
            )
        for sponsor in bill_data["SPONSOR_NAMES"]:
            # Skip names already added as original sponsors.
            if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                continue
            sponsor = self.clean_sponsor_name(sponsor)
            bill.add_sponsorship(
                name=sponsor,
                entity_type="organization"
                if "committee" in sponsor.lower() else "person",
                primary=False,
                classification="cosponsor",
            )

        # history is backwards
        for event in reversed(bill_data["HISTORY"]):

            actor = "upper" if event["chamber"] == "Senate" else "lower"

            date = event["session_date"]
            # append committee names if present
            if "committee_names" in event:
                action = (event["status"] + " " +
                          " and ".join(event["committee_names"]))
            else:
                action = event["status"]

            if event["action_code"] not in ksapi.action_codes:
                self.warning(
                    "unknown action code on %s: %s %s" %
                    (bill_id, event["action_code"], event["status"]))
                atype = None
            else:
                atype = ksapi.action_codes[event["action_code"]]
            bill.add_action(action, date, chamber=actor, classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)

        yield bill
def scrape_bill(self, chamber, session, session_id, bill_id, url):
    """Scrape one Iowa bill: title, versions (HTML + PDF), sponsors,
    actions and amendment links, then subjects.

    Fixes:
    - the billHistory request now goes through the freshly created
      requests.Session (it previously used module-level requests.get,
      leaving the session unused until scrape_subjects);
    - the "Amendment ... lost" classifier regex used ``(S|N)`` where the
      sibling "filed"/"adopted" patterns use ``(S|H)`` — ``N`` was a typo,
      so lost House amendments were never classified.
    """
    sidebar = lxml.html.fromstring(self.get(url).text)
    sidebar.make_links_absolute("https://www.legis.iowa.gov")

    hist_url = (
        f"https://www.legis.iowa.gov/legislation/billTracking/"
        f"billHistory?billName={bill_id}&ga={session_id}"
    )
    req_session = requests.Session()
    # BUGFIX: use the session (shared cookies/connection pool; it is also
    # passed to scrape_subjects below) instead of requests.get.
    req = req_session.get(hist_url)
    if req.status_code == 500:
        self.warning("500 error on {}, skipping".format(hist_url))
        return

    page = lxml.html.fromstring(req.text)
    page.make_links_absolute("https://www.legis.iowa.gov")

    title = page.xpath(
        'string(//div[@id="content"]/div[@class='
        '"divideVert"]/div/div[4]/div[2])'
    ).strip()

    if title == "":
        # Sometimes the title is moved, see
        # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
        title = page.xpath(
            'string(//div[@id="content"]/div[@class='
            '"divideVert"]/div[4]/div[2])'
        ).strip()
        if title == "":
            self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
            return

    if title.lower().startswith("in"):
        title = page.xpath("string(//table[2]/tr[3])").strip()

    # Classify by the letters in the bill id (checked most-specific last
    # matters not here since HR/SR substrings also occur in HJR/SJR ids —
    # order below reproduces the original precedence).
    if "HR" in bill_id or "SR" in bill_id:
        bill_type = ["resolution"]
    elif "HJR" in bill_id or "SJR" in bill_id:
        bill_type = ["joint resolution"]
    elif "HCR" in bill_id or "SCR" in bill_id:
        bill_type = ["concurrent resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )

    bill.add_source(hist_url)

    # base url for text version (version_abbrev, session_id, bill_id)
    version_html_url_template = (
        "https://www.legis.iowa.gov/docs/"
        "publications/LG{}/{}/attachments/{}.html"
    )
    version_pdf_url_template = (
        "https://www.legis.iowa.gov/docs/"
        "publications/LG{}/{}/{}.pdf"
    )

    # get pieces of version_link
    vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
    if vpieces:
        for version in vpieces:
            version_name = version.text
            version_abbrev = version.xpath("string(@value)")

            # Get HTML document of bill version.
            version_html_url = version_html_url_template.format(
                version_abbrev.upper(), session_id, bill_id.replace(" ", "")
            )

            bill.add_version_link(
                note=version_name, url=version_html_url, media_type="text/html"
            )

            # Get PDF document of bill version.
            version_pdf_url = version_pdf_url_template.format(
                version_abbrev.upper(), session_id, bill_id.replace(" ", "")
            )

            # "Marked Up" versions live in an embedded viewer, not at the
            # templated location.
            if "Marked Up" in version_name:
                version_pdf_url = sidebar.xpath(
                    "//iframe[@id='bbContextDoc']/@src"
                )[0]

            bill.add_version_link(
                note=version_name, url=version_pdf_url, media_type="application/pdf"
            )

    sponsors_str = page.xpath(
        'string(//div[@id="content"]/div[@class='
        '"divideVert"]/div/div[4]/div[1])'
    ).strip()

    if re.search("^By ", sponsors_str):
        sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
    # for some bills sponsors listed in different format
    else:
        sponsors = re.findall(
            r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)", sponsors_str
        )

    for sponsor in sponsors:
        sponsor = sponsor.replace(" and", "").strip(" .,")

        # a few sponsors get mangled by our regex
        sponsor = {
            "Means": "Ways & Means",
            "Iowa": "Economic Growth/Rebuild Iowa",
            "Safety": "Public Safety",
            "Resources": "Human Resources",
            "Affairs": "Veterans Affairs",
            "Protection": "Environmental Protection",
            "Government": "State Government",
            "Boef": "De Boef",
        }.get(sponsor, sponsor)

        if sponsor[0].islower():
            # SSBs catch cruft in it ('charges', 'overpayments')
            # https://sunlight.atlassian.net/browse/DATA-286
            continue

        bill.add_sponsorship(
            name=sponsor,
            classification="primary",
            entity_type="person",
            primary=True,
        )

    for tr in page.xpath(
        "//table[contains(@class, 'billActionTable')][1]/tbody/tr"
    ):
        date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
        if date.startswith("***"):
            continue
        elif "No history is recorded at this time." in date:
            return
        if date == "":
            continue

        date = datetime.datetime.strptime(date, "%B %d, %Y").date()

        action = tr.xpath("string(td[3])").strip()
        action = re.sub(r"\s+", " ", action)

        # Capture any amendment links.
        links = [link for link in [version["links"] for version in bill.versions]]
        version_urls = [link["url"] for link in [i for sub in links for i in sub]]
        if "amendment" in action.lower():
            for anchor in tr.xpath(".//a[1]"):
                if "-" in anchor.text:
                    # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                    amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                    amd_id = anchor.text.replace("-", "").strip()
                    amd_url = amd_pattern.format(session_id, amd_id)
                    amd_name = "Amendment {}".format(anchor.text.strip())

                    if amd_url not in version_urls:
                        bill.add_version_link(
                            note=amd_name, url=amd_url, media_type="application/pdf"
                        )
                        version_urls.append(amd_url)
                    else:
                        self.info("Already Added {}, skipping".format(amd_url))

        # Journal references determine the acting chamber.
        if "S.J." in action or "SCS" in action:
            actor = "upper"
        elif "H.J." in action or "HCS" in action:
            actor = "lower"
        else:
            actor = "legislature"

        action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

        if action.startswith("Introduced"):
            atype = ["introduction"]
            if ", referred to" in action:
                atype.append("referral-committee")
        elif action.startswith("Read first time"):
            atype = "reading-1"
        elif action.startswith("Referred to"):
            atype = "referral-committee"
        elif action.startswith("Sent to Governor"):
            atype = "executive-receipt"
        elif action.startswith("Reported Signed by Governor"):
            atype = "executive-signature"
        elif action.startswith("Signed by Governor"):
            atype = "executive-signature"
        elif action.startswith("Vetoed by Governor"):
            atype = "executive-veto"
        elif action.startswith("Item veto"):
            atype = "executive-veto-line-item"
        elif re.match(r"Passed (House|Senate)", action):
            atype = "passage"
        elif re.match(r"Amendment (S|H)-\d+ filed", action):
            atype = ["amendment-introduction"]
            if ", adopted" in action:
                atype.append("amendment-passage")
        elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted", action):
            atype = "amendment-passage"
        elif re.match(r"Amendment (S|H)-\d+ lost", action):
            # BUGFIX: pattern was (S|N); H matches House amendments, as in
            # the "filed"/"adopted" patterns above.
            atype = "amendment-failure"
        elif action.startswith("Resolution filed"):
            atype = "introduction"
        elif action.startswith("Resolution adopted"):
            atype = "passage"
        elif action.startswith("Committee report") and action.endswith("passage."):
            atype = "committee-passage"
        elif action.startswith("Withdrawn"):
            atype = "withdrawal"
        else:
            atype = None

        if action.strip() == "":
            continue

        if re.search(r"END OF \d+ ACTIONS", action):
            continue

        if "$history" not in action:
            bill.add_action(
                description=action, date=date, chamber=actor, classification=atype
            )

    self.scrape_subjects(bill, bill_id, session, req_session)

    yield bill
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one Hawaii measure page: metadata table, versions,
    testimony, committee reports, and actions.

    Yields VoteEvents (from parse_bill_actions_table) followed by the Bill.
    """
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    # The bill id is reconstructed from the page URL's query string.
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]

    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # Report Title is a semicolon-separated subject list.
    subs = [s.strip() for s in meta["Report Title"].split(";")]
    if "" in subs:
        subs.remove("")
    b = Bill(
        bill_id,
        session,
        meta["Measure Title"],
        chamber=chamber,
        classification=bill_type,
    )
    if meta["Description"]:
        b.add_abstract(meta["Description"], "description")

    for subject in subs:
        b.add_subject(subject)

    if url:
        b.add_source(url)

    # assumes session begins with a 4-digit year, e.g. "2019 Regular Session"
    prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
    companion = meta["Companion"].strip()
    if companion:
        b.add_related_bill(
            identifier=companion.replace(u"\xa0", " "),
            legislative_session=prior_session,
            relation_type="companion",
        )
    # A "carried over" status line relates the bill to its prior-session self.
    if bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
    ):
        prior = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        )[-1]
        if "carried over" in prior.lower():
            b.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )
    for sponsor in meta["Introducer(s)"]:
        if "(Introduced by request of another party)" in sponsor:
            sponsor = sponsor.replace(
                " (Introduced by request of another party)", "")
        if sponsor != "":
            b.add_sponsorship(sponsor, "primary", "person", True)

    # Governor's messages ("GM" bills) are sponsored by the governor.
    if "gm" in bill_id.lower():
        b.add_sponsorship("governor", "primary", "person", True)
    self.parse_bill_versions_table(b, versions)
    self.parse_testimony(b, bill_page)
    self.parse_cmte_reports(b, bill_page)

    yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
    yield b
def scrape_archive_bills(self, session):
    """Scrape bills from the legacy Illinois "legisnet" archive pages.

    Walks: session table-of-contents -> per-number-range section pages ->
    individual bill status/summary pages; yields a Bill per bill page.
    """
    # e.g. "90th" -> "90"
    session_abr = session[0:2]
    url = f"https://www.ilga.gov/legislation/legisnet{session_abr}/{session_abr}gatoc.html"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    bill_numbers_sections = doc.xpath("//table//a/@href")
    # Contains multiple bills
    for bill_numbers_section_url in bill_numbers_sections:
        bill_section_html = self.get(bill_numbers_section_url).text
        bill_section_doc = lxml.html.fromstring(bill_section_html)
        bill_section_doc.make_links_absolute(bill_numbers_section_url)
        # Chamber is encoded in the section URL path (sb/sr = Senate).
        if "/sb" in bill_numbers_section_url or "/sr" in bill_numbers_section_url:
            chamber = "upper"
        else:
            chamber = "lower"

        bills_urls = bill_section_doc.xpath("//blockquote/a/@href")
        # Actual Bill Pages
        for bill_url in bills_urls:
            bill_html = self.get(bill_url).text
            bill_doc = lxml.html.fromstring(bill_html)
            bill_doc.make_links_absolute(bill_url)

            sponsors = bill_doc.xpath('//pre/a[contains(@href, "sponsor")]')
            # The heading reads "Status of <id>" or "Summary of <id>";
            # the last whitespace-separated token is the bill id.
            bill_id = bill_doc.xpath('//font[contains (., "Status of")]')
            if len(bill_id) < 1:
                bill_id = bill_doc.xpath('//font[contains (., "Summary of")]')
            bill_id = bill_id[0].text_content().split()[-1]

            # Order matters: JRCA before JR before R.
            if "JRCA" in bill_id:
                classification = "constitutional amendment"
            elif "JR" in bill_id:
                classification = "joint resolution"
            elif "R" in bill_id:
                classification = "resolution"
            else:
                classification = "bill"

            if "status" in bill_url:
                # Currently on status page, but need info for summary page
                summary_page_url = bill_doc.xpath(
                    '//a[contains (., "Bill Summary")]/@href'
                )[0]
                summary_page_html = self.get(summary_page_url).text
                summary_page_doc = lxml.html.fromstring(summary_page_html)
                summary_page_doc.make_links_absolute(summary_page_url)
            else:
                # Currently on summary page, but need info for status page
                summary_page_doc = bill_doc
                summary_page_url = bill_url
                bill_url = bill_doc.xpath('//a[contains (., "Bill Status")]/@href')[
                    0
                ]
                bill_html = self.get(bill_url).text
                bill_doc = lxml.html.fromstring(bill_html)
                bill_doc.make_links_absolute(bill_url)

            # The title is the line following "Short description:".
            # NOTE(review): bill_title stays unset if that marker is missing
            # (NameError below) — presumably every summary page has it; confirm.
            summary_text = (
                summary_page_doc.xpath("//pre")[0].text_content().splitlines()
            )
            for x in range(len(summary_text)):
                line = summary_text[x]
                if "Short description:" in line:
                    bill_title = summary_text[x + 1]

            bill = Bill(
                bill_id,
                legislative_session=session,
                title=bill_title,
                chamber=chamber,
                classification=classification,
            )
            bill.add_source(summary_page_url)
            bill.add_source(url)

            # Sponsors
            for sponsor in sponsors:
                if sponsor.text_content():
                    bill.add_sponsorship(
                        name=sponsor.text_content(),
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

            # Bill version
            version_url = bill_doc.xpath('//a[contains (., "Full Text")]/@href')[0]
            bill.add_version_link(bill_id, version_url, media_type="text/html")

            # Actions
            bill_text = bill_doc.xpath("//pre")
            if bill_text:
                bill_text = bill_text[0].text_content().splitlines()
                for x in range(len(bill_text)):
                    line = bill_text[x].split()
                    # Regex is looking for this format: JAN-11-2001 or 99-02-17
                    if line and (
                        re.match(r"\D\D\D-\d\d-\d\d\d\d", line[0])
                        or re.match(r"\d\d-\d\d-\d\d", line[0])
                    ):
                        # 90th/91st sessions use the numeric date format.
                        if session in ["91st", "90th"]:
                            action_date = datetime.datetime.strptime(
                                line[0], "%y-%m-%d"
                            )
                        else:
                            action_date = datetime.datetime.strptime(
                                line[0], "%b-%d-%Y"
                            )
                        action_date = central.localize(action_date)
                        action_date = action_date.isoformat()
                        action = " ".join(line[2:])
                        # Second token is the acting chamber (S = Senate).
                        if line[1] == "S":
                            action_chamber = "upper"
                        else:
                            action_chamber = "lower"
                        # for/else: atype keeps the matched classifier, or
                        # None when no pattern matched.
                        for pattern, atype in _archived_action_classifiers.items():
                            if action.startswith(pattern):
                                break
                        else:
                            atype = None

                        bill.add_action(
                            action,
                            action_date,
                            chamber=action_chamber,
                            classification=atype,
                        )

            yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one Utah bill page: title, sponsors, versions, related
    documents, subjects and (via parse_status) actions/votes.
    """
    page = self.lxmlize(url)

    # The <h3> heading is "<bill_id> <title>".
    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    # NOTE(review): bill_type stays unset if the id matches none of these
    # patterns (NameError below) — presumably caller only passes these
    # forms; confirm.
    if ".B. " in bill_id:
        bill_type = "bill"
    elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
        bill_type = "resolution"
    elif ".C.R. " in bill_id:
        bill_type = "concurrent resolution"
    elif ".J.R. " in bill_id:
        bill_type = "joint resolution"

    # Strip substitute markers, then collapse whitespace.
    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)

    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        try:
            # Expect exactly two non-empty text nodes: label and name.
            (title, name) = [
                x.strip() for x in info.xpath(".//text()") if x.strip()
            ]
        except ValueError:
            self.warning(
                "Could not find sponsor's name for {}".format(bill_id))
            continue
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name, classification="primary",
                             entity_type="person", primary=True)
    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(
            floor_sponsor,
            classification="cosponsor",
            entity_type="person",
            primary=False,
        )
    else:
        raise AssertionError("Unexpected floor sponsor HTML found")

    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]')

    for version in versions:

        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get("href")
        if not url:
            url = version.xpath("following-sibling::a[1]/@href")[0]

        bill.add_version_link(version.xpath("text()")[0].strip(), url,
                              media_type="application/pdf")

    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath("@href")[0]
        # Fiscal notes are identifiable by their ".fn.pdf" suffix.
        if ".fn.pdf" in href:
            bill.add_document_link("Fiscal Note", href,
                                   media_type="application/pdf")
        else:
            text = related.xpath("text()")[0]
            bill.add_document_link(text, href, media_type="application/pdf")

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    if page.xpath('//div[@id="billStatus"]//table'):
        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

    yield bill
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
    """Scrape one Illinois bill status page: title, synopsis abstract,
    actions, sponsors, versions; then yields the Bill and its votes.

    Fix: the sponsor loop's chamber-less branch passed
    ``(spontype, sponsor)`` to add_sponsorship — swapped relative to the
    ``if chamber:`` branch — so those sponsors had name and classification
    exchanged. Arguments are now in (name, classification) order in both
    branches.
    """
    try:
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
    except scrapelib.HTTPError as e:
        assert (
            "500" in e.args[0]
        ), "Unexpected error when accessing page: {}".format(e)
        self.warning("500 error for bill page; skipping bill")
        return

    # bill id, title, summary
    bill_num = re.findall(r"DocNum=(\d+)", url)[0]
    bill_type = bill_type or DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath(
        '//span[text()="Short Description:"]/following-sibling::span[1]/'
        "text()"
    )[0].strip()

    # 1. Find the heading with "Synopsis As Introduced" for text.
    # 2. Go to the next heading.
    # 3. Backtrack and grab everything to, but not including, #1.
    # 4. Grab text of all, including nested, nodes.
    summary_nodes = doc.xpath(
        '//span[text()="Synopsis As Introduced"]/following-sibling::span[contains(@class, "heading2")]/'
        'preceding-sibling::*[preceding-sibling::span[text()="Synopsis As Introduced"]]//'
        "text()"
    )
    summary = "\n".join([node.strip() for node in summary_nodes])

    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        title=title,
        classification=bill_type,
        chamber=chamber,
    )

    bill.add_abstract(summary, note="")
    bill.add_source(url)

    # sponsors
    sponsor_list = build_sponsor_list(doc.xpath('//a[contains(@class, "content")]'))
    # don't add just yet; we can make them better using action data

    committee_actors = {}

    # actions
    action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action_elem in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
        date = self.localize(date).date()
        actor = actor.text_content()
        # NOTE(review): when actor is neither House nor Senate, actor_id
        # keeps its value from the previous iteration (or raises on the
        # first) — confirm the page only ever emits these two actors.
        if actor == "House":
            actor_id = {"classification": "lower"}
        elif actor == "Senate":
            actor_id = {"classification": "upper"}

        action = action_elem.text_content()
        classification, related_orgs = _categorize_action(action)

        # Disabled committee-actor resolution; this is the only code that
        # would populate committee_actors (passed to scrape_votes below).
        # if related_orgs and any(c.startswith("committee") for c in classification):
        #     ((name, source),) = [
        #         (a.text, a.get("href"))
        #         for a in action_elem.xpath("a")
        #         if "committee" in a.get("href")
        #     ]
        #     source = canonicalize_url(source)
        #     actor_id = {"sources__url": source, "classification": "committee"}
        #     committee_actors[source] = name

        bill.add_action(
            action,
            date,
            organization=actor_id,
            classification=classification,
            related_entities=related_orgs,
        )

        if action.lower().find("sponsor") != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    for spontype, sponsor, chamber, official_type in sponsor_list:
        if official_type == "primary":
            primary = True
        else:
            primary = False
        if chamber:
            bill.add_sponsorship(
                sponsor, spontype, "person", primary=primary, chamber=chamber
            )
        else:
            # BUGFIX: arguments were swapped as (spontype, sponsor),
            # exchanging sponsor name and classification.
            bill.add_sponsorship(sponsor, spontype, "person", primary=primary)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)
    yield bill

    votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
    yield from self.scrape_votes(session, bill, votes_url, committee_actors)
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape all CA bills of one measure type from the local DB mirror.

    Queries CABill rows for (session, type_abbr), then for each bill emits
    VoteEvents followed by the Bill itself.

    NOTE: the committee_abbr_regex default is evaluated once at import
    time (mutable-ish default pattern) -- intentional here since the regex
    is a constant, but worth knowing when modifying.
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))

    # Sessions from 2009 on have full (non-archived) data.
    archive_year = int(session[0:4])
    not_archive_year = archive_year >= 2009

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        # known-bad record in the 2005-2006 data; skip it outright
        if bill_id.strip() == "SB77" and session == "20052006":
            continue

        fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
        if (bill_id.startswith("S") and chamber == "lower") or (
                bill_id.startswith("A") and chamber == "upper"):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        summary = ""

        # Get digest test (aka "summary") from latest version.
        if bill.versions and not_archive_year:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                # re-insert the space that the XML flattening drops after ")"
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        # title/type/tags/subject are overwritten on each iteration, so the
        # values used below come from the LAST version that has bill_xml.
        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"

            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == "Yes":
                type_.append("appropriation")

            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # NOTE(review): "version" here is the leaked loop variable from the
        # versions loop above, i.e. sponsors come from the LAST version with
        # bill_xml -- presumably intentional, but confirm.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {
                            "Assembly": "lower",
                            "Senate": "upper"
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            # NOTE(review): type_ is reassigned here but never read again in
            # this loop -- looks vestigial.
            type_ = []

            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        self.warning(msg % args)

                # NOTE(review): filter() returns a one-shot iterator in
                # Python 3; list(committees) in the assert below EXHAUSTS it,
                # so the subsequent zip(committees, ...) yields nothing and
                # kwargs["committees"] is left drained -- likely a py2->py3
                # porting bug; confirm and materialize to a list.
                committees = filter(None, committees)
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}
                if not_archive_year:
                    assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith("."):
                        act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            # de-duplicate identical (actor, text, date) actions
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee, entity_type="organization")
            seen_actions.add((actor, act_str, date))

        source_url = (
            "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
        )
        source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

        # Votes for non archived years
        # NOTE(review): boundary inconsistency -- versions treat 2009 as
        # non-archive (>= 2009) but votes treat it as archive (> 2009 here,
        # <= 2009 below); confirm which is intended for 2009 itself.
        if archive_year > 2009:
            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    # raise ScrapeError("Bad location: %s" % full_loc)
                    # To uncomment
                    continue

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = []

                # Strip session/chamber/bill-id boilerplate out of the motion
                # text so only the motion itself remains.
                motion = motion.strip()
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "",
                                motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                #     'name': vote_location,
                #     'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}

                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

        # Archived years: votes are only available by scraping the public
        # vote page HTML.
        if len(bill.votes) > 0 and archive_year <= 2009:
            vote_page_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            vote_page_url += (
                f"bill_id={session}{bill.session_num}{fsbill.identifier}")

            # parse the bill data page, finding the latest html text
            data = self.get(vote_page_url).content
            doc = html.fromstring(data)
            doc.make_links_absolute(vote_page_url)

            num_of_votes = len(doc.xpath("//div[@class='status']"))
            for vote_section in range(1, num_of_votes + 1):
                lines = doc.xpath(
                    f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                )
                date, result, motion, vtype, location = "", "", "", "", ""
                votes = {}

                # Each statusRow is a "Label value..." line; dispatch on the
                # first whitespace-split token.
                for line in lines:
                    line = line.text_content().split()
                    if line[0] == "Date":
                        date = line[1]
                        date = datetime.datetime.strptime(date, "%m/%d/%y")
                        date = self._tz.localize(date)
                    elif line[0] == "Result":
                        result = "pass" if "PASS" in line[1] else "fail"
                    elif line[0] == "Motion":
                        motion = " ".join(line[1:])
                    elif line[0] == "Location":
                        location = " ".join(line[1:])
                    elif len(line) > 1:
                        if line[0] == "Ayes" and line[1] != "Count":
                            votes["yes"] = line[1:]
                        elif line[0] == "Noes" and line[1] != "Count":
                            votes["no"] = line[1:]
                        elif line[0] == "NVR" and line[1] != "Count":
                            votes["not voting"] = line[1:]

                # Determine chamber based on location
                first_part = location.split(" ")[0].lower()
                vote_chamber = ""
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                # a blank motion means the section wasn't a real vote
                if len(motion) > 0:
                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=date,
                        result=result,
                        classification=vtype,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.add_source(vote_page_url)
                    fsvote.pupa_id = vote_page_url + "#" + str(
                        vote_section)

                    for how_voted, voters in votes.items():
                        for voter in voters:
                            voter = voter.replace(",", "")
                            fsvote.vote(how_voted, voter)
                    yield fsvote

        yield fsbill

    # drop cached ORM state so the next type's query sees fresh rows
    self.session.expire_all()
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Build a Bill from the SD LRC JSON API, then emit its actions and the bill."""
    payload = self.get(url).json()
    api_id = payload["BillId"]

    # Classify by bill-id prefix; first matching pattern wins, default "bill".
    classification = ["bill"]
    for pattern, kind in (
        (r"^(S|H)B ", "bill"),
        (r"(S|H)C ", "commemoration"),
        (r"(S|H)JR ", "joint resolution"),
        (r"(S|H)CR ", "concurrent resolution"),
    ):
        if re.match(pattern, bill_id):
            classification = [kind]
            break

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=classification,
    )
    bill.add_source(url)

    documents = payload["Documents"]
    assert len(documents) > 0
    for doc_row in documents:
        raw_date = doc_row["DocumentDate"]
        if not raw_date:
            self.warning("Version listed but no date or documents")
            continue
        matched = re.match(r"\d{4}-\d{2}-\d{2}", raw_date)
        version_date = datetime.datetime.strptime(
            matched.group(0), "%Y-%m-%d"
        ).date()
        doc_id = doc_row["DocumentId"]
        html_link = f"https://sdlegislature.gov/Session/Bill/{api_id}/{doc_id}"
        pdf_link = f"https://mylrc.sdlegislature.gov/api/Documents/{doc_id}.pdf"
        note = doc_row["BillVersion"]
        # Each version is published both as HTML and as PDF.
        for link, mime in ((html_link, "text/html"), (pdf_link, "application/pdf")):
            bill.add_version_link(
                note,
                link,
                date=version_date,
                media_type=mime,
                on_duplicate="ignore",
            )

    sponsors = payload["BillSponsor"]
    if sponsors:
        for sponsor in sponsors:
            # first and last name are available, but UniqueName is the old
            # link text; could change later?
            bill.add_sponsorship(
                sponsor["Member"]["UniqueName"],
                classification="primary",
                primary=True,
                entity_type="person",
            )
    else:
        # No member sponsors: the committee sponsor is embedded as an HTML
        # anchor in the API payload -- pull out the link text.
        committee_sponsor = re.search(r">(.*)</a>", payload["BillCommitteeSponsor"])[1]
        bill.add_sponsorship(
            committee_sponsor,
            classification="primary",
            primary=True,
            entity_type="organization",
        )

    for keyword in payload["Keywords"]:
        bill.add_subject(keyword["Keyword"]["Keyword"])

    actions_url = f"https://sdlegislature.gov/api/Bills/ActionLog/{api_id}"
    yield from self.scrape_action(bill, actions_url, chamber)
    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one OK bill page: title, sponsors, actions, versions, and votes."""
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        self.warning("error (%s) fetching %s, skipping" % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
    if not title:
        self.warning("blank bill on %s - skipping", url)
        return

    # Order matters: "JR"/"CR" both contain "R", so test them first.
    if "JR" in bill_id:
        classification = ["joint resolution"]
    elif "CR" in bill_id:
        classification = ["concurrent resolution"]
    elif "R" in bill_id:
        classification = ["resolution"]
    else:
        classification = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=classification,
    )
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    # Sponsors: "otherAuth" links are cosponsors, the rest are primary.
    for author_link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = author_link.xpath("string()").strip()
        if "author not found" in name.lower():
            continue
        if ":" in name:
            raise Exception(name)
        is_cosponsor = "otherAuth" in author_link.attrib["id"]
        bill.add_sponsorship(
            name,
            classification="cosponsor" if is_cosponsor else "primary",
            entity_type="person",
            primary=not is_cosponsor,
        )

    chamber_codes = {"H": "lower", "S": "upper"}
    action_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for row in action_table.xpath("tr")[2:]:
        description = row.xpath("string(td[1])").strip()
        if not description or description == "None":
            continue

        acted_on = datetime.datetime.strptime(
            row.xpath("string(td[3])").strip(), "%m/%d/%Y"
        ).date()
        actor = row.xpath("string(td[4])").strip()
        actor = chamber_codes.get(actor, actor)

        attrs = self.categorizer.categorize(description)
        related_entities = [
            {"type": "committee", "name": name} for name in attrs["committees"]
        ] + [
            {"type": "legislator", "name": name} for name in attrs["legislators"]
        ]
        bill.add_action(
            description=description,
            date=acted_on.strftime("%Y-%m-%d"),
            chamber=actor,
            classification=attrs["classification"],
            related_entities=related_entities,
        )

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    # Keep track of already seen versions to prevent processing duplicates.
    seen_version_urls = set()
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib["href"]
        if version_url in seen_version_urls:
            self.warning("Skipping duplicate version URL.")
            continue
        seen_version_urls.add(version_url)

        if link.text is None:
            self.warning("Skipping unnamed version.")
            continue
        name = link.text.strip()

        # Committee reports and scheduled CCRs are documents, not versions.
        if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE):
            bill.add_document_link(note=name, url=version_url,
                                   media_type="application/pdf")
        else:
            bill.add_version_link(note=name, url=version_url,
                                  media_type="application/pdf")

    self.scrape_amendments(bill, page)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        if "HT_" not in link.attrib["href"]:
            yield from self.scrape_votes(
                bill, self.urlescape(link.attrib["href"]))

    # A placeholder short title marks a bogus entry that occasionally
    # appears on their website; skip those, otherwise save the bill.
    if bill.title == "Short Title Not Found.":
        return
    yield bill
def parse_bill(self, chamber, session, bill_id, url):
    """Parse one KY bill page; yields vote events (if any) and the Bill."""
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    last_action_field = self.parse_bill_field(page, "Last Action")
    if last_action_field != "":
        last_action = last_action_field.xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

    title = self.parse_bill_field(page, "Title").text_content()

    # Order matters: "CR" and "JR" both contain "R".
    bill_type = "bill"
    for marker, kind in (
        ("CR", "concurrent resolution"),
        ("JR", "joint resolution"),
        ("R", "resolution"),
    ):
        if marker in bill_id:
            bill_type = kind
            break

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    if self.parse_versions(page, bill) < 1:
        # A page with no versions means the bill was withdrawn.
        self.logger.warning("Bill withdrawn.")
        return

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    for note_link in page.xpath('//a[contains(@href, "/LM.pdf")]'):
        note_url = note_link.attrib["href"]
        bill.add_document_link(
            "Fiscal Note", note_url, media_type=get_media_type(note_url))

    for sponsor_link in page.xpath(
            "//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            sponsor_link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    bdr_field = self.parse_bill_field(page, "Bill Request Number")
    if bdr_field != "" and bdr_field.xpath("text()"):
        bill.extras["BDR"] = bdr_field.xpath("text()")[0].strip()

    yield bill