def scrape_bill(self, session, session_slug, chamber, url):
    """Scrape a single Nevada bill from its NELIS overview page and yield a Bill."""
    overview = lxml.html.fromstring(self.get(url).text)
    bill_no = overview.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)
    # bill data gets filled in from another call
    detail_url = (
        "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
        "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}"
    ).format(session_slug, internal_id, time.time() * 1000)
    bill_page = lxml.html.fromstring(self.get(detail_url).text)

    short_title = self.get_header_field(bill_page, "Summary:").text
    short_title = short_title.replace("\u00a0", " ")

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber,
    )

    long_title = self.get_header_field(bill_page, "Title:").text
    if long_title is not None:
        bill.add_abstract(long_title, "Summary")

    # Primary sponsors and cosponsors live in separate header sections.
    for label, classification in (
        ("Primary Sponsor", "primary"),
        ("Co-Sponsor", "cosponsor"),
    ):
        section = self.get_header_field(bill_page, label)
        if section is not None:
            self.add_sponsors(section, bill, classification)

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)
    bill.subject = list(set(self.subject_mapping[bill_no]))

    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr
    bill.extras["NV_ID"] = internal_id

    bill.add_source(url)
    yield bill
def scrape_bill(self, session, chamber, bill_url):
    """Scrape one Colorado bill page, yielding the Bill followed by its votes."""
    full_url = "{}{}".format(CO_URL_BASE, bill_url)
    try:
        page = self.lxmlize(full_url)
    except scrapelib.HTTPError as e:
        # The site intermittently 503s on individual bills; skip those.
        if e.response.status_code != 503:
            raise
        self.error("Skipping %s w/ 503", bill_url)
        return

    bill_number = page.xpath(
        '//div[contains(@class,"field-name-field-bill-number")]'
        '//div[contains(@class,"field-item even")][1]/text()'
    )[0].strip()
    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]
    summary_text = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])'
    )
    summary_text = summary_text.replace("Read More", "").strip()

    bill = Bill(
        bill_number, legislative_session=session, chamber=chamber, title=bill_title
    )
    if summary_text:
        bill.add_abstract(summary_text, "summary")
    bill.add_source(full_url)

    # Each helper extracts one section out of the already-fetched page.
    for section_scraper in (
        self.scrape_sponsors,
        self.scrape_actions,
        self.scrape_versions,
        self.scrape_research_notes,
        self.scrape_fiscal_notes,
        self.scrape_committee_report,
        self.scrape_amendments,
    ):
        section_scraper(bill, page)

    yield bill
    yield from self.scrape_votes(session, bill, page)
def scrape(self, session=None):
    """Scrape Virginia bills and their roll-call votes for one session.

    Pulls the LIS CSV exports over SFTP into in-memory indexes
    (members, sponsors, fiscal notes, summaries, history, votes,
    amendments), then walks ``self._bills`` yielding Bill and
    VoteEvent objects.
    """
    if not session:
        # Default to the most recently listed session.
        session = self.jurisdiction.legislative_sessions[-1]["identifier"]
        self.info("no session specified, using %s", session)
    # The first letter of a bill id encodes its originating body.
    chamber_types = {
        "H": "lower",
        "S": "upper",
        "G": "executive",
        "C": "legislature",
    }
    # pull the current session's details to tell if it's a special
    session_details = next(
        each
        for each in self.jurisdiction.legislative_sessions
        if each["identifier"] == session)
    is_special = False
    if ("classification" in session_details
            and session_details["classification"] == "special"):
        is_special = True
    session_id = SESSION_SITE_IDS[session]
    self.init_sftp(session_id)
    bill_url_base = "https://lis.virginia.gov/cgi-bin/"
    # Special sessions only get the bill file itself; the auxiliary
    # CSVs are skipped for them.
    if not is_special:
        self.load_members()
        self.load_sponsors()
        self.load_fiscal_notes()
        self.load_summaries()
        self.load_history()
        self.load_votes()
    self.load_bills()
    if not is_special:
        self.load_amendments()
    for bill in self._bills:
        # _bills maps bill_id -> list of CSV rows; the first row carries
        # the bill-level fields.
        bill = self._bills[bill][0]
        bill_id = bill["bill_id"]
        chamber = chamber_types[bill_id[0]]
        # The second character encodes the bill type (HB, HJ, SR, ...).
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        b = Bill(
            bill_id,
            session,
            bill["bill_description"],
            chamber=chamber,
            classification=bill_type,
        )
        bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
        b.add_source(bill_url)
        # Long Bill ID needs to have 6 characters to work with vote urls,
        # sponsors, and summaries.
        # Fill in blanks with 0s
        long_bill_id = bill_id
        if len(bill_id) == 3:
            long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
        elif len(bill_id) == 4:
            long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
        elif len(bill_id) == 5:
            long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]
        # Sponsors: when the sponsors index has no entry, fall back to
        # the patron named on the bill row itself.
        if long_bill_id not in self._sponsors:
            if "patron_name" in bill and bill["patron_name"].strip() != "":
                b.add_sponsorship(
                    bill["patron_name"],
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        # NOTE(review): this iteration assumes self._sponsors tolerates
        # missing keys (e.g. defaultdict(list)); a plain dict would raise
        # KeyError for the fallback case above — confirm in load_sponsors.
        for spon in self._sponsors[long_bill_id]:
            if spon["member_name"].strip() == "":
                continue
            sponsor_type = spon["patron_type"]
            if sponsor_type.endswith("Chief Patron"):
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            b.add_sponsorship(
                spon["member_name"],
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )
        # Summary
        summary_texts = self._summaries[long_bill_id]
        for sum_text in summary_texts:
            b.add_abstract(sum_text["summary_text"], sum_text["summary_type"])
        # Amendment docs
        amendments = self._amendments[bill_id]
        for amend in amendments:
            doc_link = (
                bill_url_base
                + f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
            b.add_document_link(
                "Amendment: " + amend["txt_docid"], doc_link,
                media_type="text/html")
        # fiscal notes
        for fn in self._fiscal_notes[long_bill_id]:
            doc_link = bill_url_base + f"legp604.exe?{session_id}+oth+{fn['refid']}"
            b.add_document_link(
                "Fiscal Impact Statement: " + fn["refid"],
                # ".PDF" -> "+PDF" yields the site's inline-PDF URL form
                doc_link.replace(".PDF", "+PDF"),
                media_type="application/pdf",
            )
        # actions with 8-digit number followed by D are version titles too
        doc_actions = defaultdict(list)
        # History and then votes
        for hist in self._history[bill_id]:
            action = hist["history_description"]
            action_date = hist["history_date"]
            date = datetime.datetime.strptime(action_date, "%m/%d/%y").date()
            chamber = chamber_types[action[0]]
            vote_id = hist["history_refid"]
            # First two characters are the chamber prefix; strip them.
            cleaned_action = action[2:]
            if re.findall(r"\d{8}D", cleaned_action):
                doc_actions[action_date].append(cleaned_action)
            # categorize actions
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, cleaned_action):
                    break
            else:
                # for/else: no classifier matched this action text
                atype = None
            if atype != SKIP:
                b.add_action(
                    cleaned_action, date, chamber=chamber,
                    classification=atype)
            # A non-empty history_refid marks a roll-call vote on this action.
            if len(vote_id) > 0:
                total_yes = 0
                total_no = 0
                total_not_voting = 0
                total_abstain = 0
                for v in self._votes[vote_id]:
                    if v["vote_result"] == "yes":
                        total_yes += 1
                    elif v["vote_result"] == "no":
                        total_no += 1
                    elif v["vote_result"] == "not voting":
                        total_not_voting += 1
                    elif v["vote_result"] == "abstain":
                        total_abstain += 1
                vote = VoteEvent(
                    identifier=vote_id,
                    start_date=date,
                    chamber=chamber,
                    motion_text=cleaned_action,
                    result="pass" if total_yes > total_no else "fail",
                    classification="passage",
                    bill=b,
                )
                vote.set_count("yes", total_yes)
                vote.set_count("no", total_no)
                vote.set_count("not voting", total_not_voting)
                vote.set_count("abstain", total_abstain)
                vote_url = (
                    bill_url_base
                    + f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                )
                vote.add_source(vote_url)
                for v in self._votes[vote_id]:
                    vote.vote(v["vote_result"], v["member_id"])
                yield vote
        # Versions
        for version in bill["text_docs"]:
            # Checks if abbr is blank as not every bill has multiple versions
            if version["doc_abbr"]:
                version_url = (
                    bill_url_base
                    + f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                version_date = datetime.datetime.strptime(
                    version["doc_date"], "%m/%d/%y").date()
                # version text will default to abbreviation provided in CSV
                # but if there is an unambiguous action from that date with
                # a version, we'll use that as the document title
                version_text = version["doc_abbr"]
                if len(doc_actions[version["doc_date"]]) == 1:
                    version_text = doc_actions[version["doc_date"]][0]
                b.add_version_link(
                    version_text,
                    version_url,
                    date=version_date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
        yield b
def _scrape_bill(self, session, bill_data):
    """Build a Bill (plus vote events) from one NY Open Legislation payload.

    Yields VoteEvent objects for senate/assembly votes, then the Bill.
    Returns early (yielding nothing) when the payload cannot be parsed.

    Fix: guard the companion-bill lookup — ``same_as`` defaults to ``{}``
    when the amendment has no "sameAs" key, so indexing ``same_as["items"]``
    raised KeyError in exactly that case; use ``.get("items")`` instead.
    """
    details = self._parse_bill_details(bill_data)
    if details is None:
        return
    (
        senate_url,
        assembly_url,
        bill_chamber,
        bill_type,
        bill_id,
        title,
        (prefix, number, active_version),
    ) = details
    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data["summary"],
        classification=bill_type,
    )
    if bill_data["summary"]:
        bill.add_abstract(bill_data["summary"], note="")
    bill_active_version = None
    if active_version != "":
        bill_active_version = bill_data["amendments"]["items"][active_version]
    else:
        self.warning("No active version for {}".format(bill_id))

    # Parse sponsors.
    if bill_data["sponsor"] is not None:
        if bill_data["sponsor"]["rules"] is True:
            # Bills sponsored by the Rules Committee have no member sponsor.
            bill.add_sponsorship(
                "Rules Committee",
                entity_type="organization",
                classification="primary",
                primary=True,
            )
        elif not bill_data["sponsor"]["budget"]:
            primary_sponsor = bill_data["sponsor"]["member"]
            bill.add_sponsorship(
                primary_sponsor["shortName"],
                entity_type="person",
                classification="primary",
                primary=True,
            )
            if bill_active_version:
                # There *shouldn't* be cosponsors if there is no sponsor.
                cosponsors = bill_active_version["coSponsors"]["items"]
                for cosponsor in cosponsors:
                    bill.add_sponsorship(
                        cosponsor["shortName"],
                        entity_type="person",
                        classification="cosponsor",
                        primary=False,
                    )

    if bill_active_version:
        # List companion bill.
        same_as = bill_active_version.get("sameAs", {})
        # Check whether "sameAs" property is populated with at least one bill
        # (guarded lookup: "sameAs" itself may be absent, see docstring).
        if same_as.get("items"):
            # Get companion bill ID.
            companion_bill_id = same_as["items"][0]["basePrintNo"]
            # Build companion bill session.
            start_year = same_as["items"][0]["session"]
            end_year = start_year + 1
            companion_bill_session = "-".join([str(start_year), str(end_year)])
            # Attach companion bill data.
            bill.add_related_bill(
                companion_bill_id, companion_bill_session, relation_type="companion"
            )

    # Parse actions.
    chamber_map = {"senate": "upper", "assembly": "lower"}
    for action in bill_data["actions"]["items"]:
        chamber = chamber_map[action["chamber"].lower()]
        action_datetime = datetime.datetime.strptime(action["date"], "%Y-%m-%d")
        action_date = action_datetime.date()
        types, _ = NYBillScraper.categorizer.categorize(action["text"])
        bill.add_action(
            action["text"],
            action_date.strftime("%Y-%m-%d"),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo
    # List Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources["bill"].format(
        session_year=session, bill_id=bill_id, summary="", detail=""
    )
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    for vote_data in bill_data["votes"]["items"]:
        yield self._parse_senate_votes(vote_data, bill, api_url)
    yield from self.scrape_assembly_votes(session, bill, assembly_url, bill_id)

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data["amendments"]["items"]
    for key, amendment in amendments.items():
        version = amendment["printNo"]
        html_url = (
            "http://assembly.state.ny.us/leg/?sh=printbill&bn="
            "{}&term={}&Text=Y".format(bill_id, self.term_start_year)
        )
        bill.add_version_link(
            version, html_url, on_duplicate="ignore", media_type="text/html"
        )
        pdf_url = "http://legislation.nysenate.gov/pdf/bills/{}/{}".format(
            self.term_start_year, version
        )
        bill.add_version_link(
            version, pdf_url, on_duplicate="ignore", media_type="application/pdf"
        )
    yield bill
def scrape_bill(self, session, bill_id, chamber):
    """Scrape one MA bill detail page; yields a Bill, or returns False on failure."""
    # e.g. https://malegislature.gov/Bills/189/SD2739
    url_session = self.replace_non_digits(session)
    bill_url = "https://malegislature.gov/Bills/{}/{}".format(url_session, bill_id)

    try:
        response = self.get(bill_url)
        self.info("GET (with `requests`) - {}".format(bill_url))
    except requests.exceptions.RequestException:
        self.warning("Server Error on {}".format(bill_url))
        return False

    page = lxml.html.fromstring(response.text)
    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning("Server Error on {}".format(bill_url))
        return False

    # The state website will periodically miss a few bills' titles for a few
    # days. These titles will be extant on the bill list page, but missing on
    # the bill detail page; they eventually appear under one of two markups,
    # so try both before giving up.
    heading = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')
    bill_title = heading[0] if heading else None
    if bill_title is None:
        fallback = page.xpath('//div[contains(@class,"followable")]/h1/text()')
        if not fallback:
            self.warning("Couldn't find title for {}; skipping".format(bill_id))
            return False
        bill_title = fallback[0].replace("Bill", "").strip()

    if re.sub("[0-9]", "", bill_id) not in ("H", "HD", "S", "SD", "SRes"):
        self.warning("Unsupported bill type for {}; skipping".format(bill_id))
        return False
    if "SRes" in bill_id:
        bill_id = bill_id.replace("SRes", "SR")

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification="bill",
    )

    pinslip = page.xpath('//p[@id="pinslip"]/text()')
    if pinslip and pinslip[0]:
        bill.add_abstract(pinslip[0], "summary")

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor_texts = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        "following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
    )
    if sponsor_texts:
        name = (
            sponsor_texts[0]
            .replace("*", "")
            .replace("%", "")
            .replace("This sponsor is an original petitioner.", "")
            .strip()
        )
        bill.add_sponsorship(
            name, classification="primary", primary=True, entity_type="person"
        )
    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath(
        "//div[contains(@class, 'modalBtnGroup')]/"
        "a[contains(text(), 'Download PDF') and not(@disabled)]/@href"
    )
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link(
            "Bill Text", version_url, media_type="application/pdf"
        )

    self.scrape_actions(bill, bill_url, session)
    yield bill
def scrape_bill(self, row, session):
    """Build a Delaware Bill from one API row plus its HTML detail page."""
    bill_id = row["LegislationDisplayCode"]

    amendment = None
    substitute = None
    if bill_id.count(" ") > 1:
        if " w/ " in bill_id:
            self.info("Found amended bill `{}`".format(bill_id))
            bill_id, amendment = bill_id.split(" w/ ")
        if " -" in bill_id:
            self.info("Found amended bill `{}`".format(bill_id))
            bill_id, amendment = bill_id.split(" -")
        # A bill can _both_ be amended and be substituted
        if " for " in bill_id:
            self.info("Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(" for ")
        if amendment is None and substitute is None:
            raise ValueError("unknown bill_id format: " + bill_id)

    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        chamber="upper" if bill_id.startswith("S") else "lower",
        title=row["LongTitle"],
        classification=self.classify_bill(bill_id),
    )
    if row["Synopsis"]:
        bill.add_abstract(row["Synopsis"], "synopsis")
    if row["ShortTitle"]:
        bill.add_title(row["ShortTitle"], "short title")
    if row["SponsorPersonId"]:
        self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"], "primary")
    if substitute:
        bill.extras["substitute"] = substitute
    if amendment:
        bill.extras["amendment"] = amendment

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
        row["LegislationId"]
    )
    bill.add_source(html_url, note="text/html")
    doc = self.lxmlize(html_url)

    # Sponsor ids are embedded in legislator-profile link URLs.
    profile_prefix = "https://legis.delaware.gov/LegislatorDetail?personId="
    sponsor_sections = (
        (
            '//label[text()="Additional Sponsor(s):"]'
            "/following-sibling::div/a/@href",
            "primary",
        ),
        (
            '//label[text()="Co-Sponsor(s):"]/'
            "following-sibling::div/a/@href",
            "cosponsor",
        ),
    )
    for xpath, classification in sponsor_sections:
        for link in doc.xpath(xpath):
            self.add_sponsor_by_legislator_id(
                bill, link.replace(profile_prefix, ""), classification
            )

    for version_url in doc.xpath(
        '//label[text()="Original Text:"]/following-sibling::div/a/@href'
    ):
        bill.add_version_link(
            "Bill Text", version_url, media_type=self.mime_from_link(version_url)
        )

    for fiscal_url in doc.xpath('//div[contains(@class,"fiscalNote")]/a/@href'):
        self.scrape_fiscal_note(bill, fiscal_url)

    self.scrape_actions(bill, row["LegislationId"])
    if row["HasAmendments"] is True:
        self.scrape_amendments(bill, row["LegislationId"])

    yield from self.scrape_votes(bill, row["LegislationId"], session)
    yield bill
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape every California bill of one measure type for a session.

    Reads bills from the local mirror DB (``self.session``), yielding
    VoteEvent objects as they are built and one Bill per measure.
    Pre-2009 sessions ("archive years") lack XML versions, so summaries
    and some consistency checks are skipped and votes are scraped from
    the public vote page instead of the DB.

    Note: ``committee_abbr_regex`` is evaluated once at definition time
    (mutable-default idiom used deliberately as a cached constant).
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))
    archive_year = int(session[0:4])
    not_archive_year = archive_year >= 2009
    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            # Non-zero session_num marks a special session.
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        if bill_id.strip() == "SB77" and session == "20052006":
            # Known bad record; skip it explicitly.
            continue

        fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
        if (bill_id.startswith("S")
                and chamber == "lower") or (bill_id.startswith("A")
                                            and chamber == "upper"):
            # NOTE(review): debug print left in — should arguably be
            # self.warning(); left untouched here.
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        summary = ""

        # Get digest test (aka "summary") from latest version.
        if bill.versions and not_archive_year:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                # Re-insert the space lost after ")" during whitespace collapse.
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue
            version_date = self._tz.localize(version.bill_version_action_date)
            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)
            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)
            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )
            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)
            if title:
                all_titles.add(title)
            type_ = [bill_type]
            if version.appropriation == "Yes":
                type_.append("appropriation")
            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")
            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            # Also covers bills with no usable versions at all.
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        # `impact_clause` and `tags` leak from the last loop iteration;
        # the `title` guard above ensures the loop ran at least once.
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # `version` here is the last version from the loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {
                            "Assembly": "lower",
                            "Senate": "upper"
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            type_ = []
            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)
            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)
            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        self.warning(msg % args)

                committees = filter(None, committees)
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}
                # NOTE(review): `committees` is a single-pass filter
                # iterator; `list(committees)` in the assert below consumes
                # it, so the subsequent zip (and kwargs["committees"]) see
                # an exhausted iterator when not_archive_year is true —
                # confirm whether a materialized list was intended.
                if not_archive_year:
                    assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith("."):
                        act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                # Skip exact duplicate actions.
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))

        source_url = (
            "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
        )
        source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

        # Votes for non archived years
        if archive_year > 2009:
            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue
                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    # raise ScrapeError("Bad location: %s" % full_loc)
                    # To uncomment
                    continue

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                # Strip session/chamber/bill-id boilerplate from the motion.
                motion = motion.strip()
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "",
                                motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                #     'name': vote_location,
                #     'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

        if len(bill.votes) > 0 and archive_year <= 2009:
            vote_page_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            vote_page_url += (
                f"bill_id={session}{bill.session_num}{fsbill.identifier}")

            # parse the bill data page, finding the latest html text
            data = self.get(vote_page_url).content
            doc = html.fromstring(data)
            doc.make_links_absolute(vote_page_url)

            num_of_votes = len(doc.xpath("//div[@class='status']"))
            for vote_section in range(1, num_of_votes + 1):
                lines = doc.xpath(
                    f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                )
                date, result, motion, vtype, location = "", "", "", "", ""
                votes = {}
                for line in lines:
                    line = line.text_content().split()
                    if line[0] == "Date":
                        date = line[1]
                        date = datetime.datetime.strptime(date, "%m/%d/%y")
                        date = self._tz.localize(date)
                    elif line[0] == "Result":
                        result = "pass" if "PASS" in line[1] else "fail"
                    elif line[0] == "Motion":
                        motion = " ".join(line[1:])
                    elif line[0] == "Location":
                        location = " ".join(line[1:])
                    elif len(line) > 1:
                        if line[0] == "Ayes" and line[1] != "Count":
                            votes["yes"] = line[1:]
                        elif line[0] == "Noes" and line[1] != "Count":
                            votes["no"] = line[1:]
                        elif line[0] == "NVR" and line[1] != "Count":
                            votes["not voting"] = line[1:]

                # Determine chamber based on location
                first_part = location.split(" ")[0].lower()
                vote_chamber = ""
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                if len(motion) > 0:
                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=date,
                        result=result,
                        classification=vtype,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.add_source(vote_page_url)
                    fsvote.pupa_id = vote_page_url + "#" + str(
                        vote_section)

                    for how_voted, voters in votes.items():
                        for voter in voters:
                            voter = voter.replace(",", "")
                            fsvote.vote(how_voted, voter)
                    yield fsvote

        yield fsbill
    # Drop cached ORM state so the next type/chamber pass re-reads the DB.
    self.session.expire_all()
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape one Kentucky bill page; yields vote events then the Bill."""
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    # A withdrawn bill keeps its page but loses its title; detect that
    # from the Last Action field up front.
    withdrawn = False
    if self.parse_bill_field(page, "Last Action") != "":
        last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            withdrawn = True

    if withdrawn:
        title = "Withdrawn."
    else:
        title = self.parse_bill_field(page, "Title").text_content()

    # Order matters: "CR" and "JR" both contain "R".
    for marker, kind in (
        ("CR", "concurrent resolution"),
        ("JR", "joint resolution"),
        ("R", "resolution"),
    ):
        if marker in bill_id:
            bill_type = kind
            break
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    self.parse_versions(page, bill)
    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    for note_link in page.xpath('//a[contains(@href, "/LM.pdf")]'):
        note_url = note_link.attrib["href"]
        bill.add_document_link(
            "Fiscal Note", note_url, media_type=get_media_type(note_url)
        )

    # only grab links in the first table, because proposed amendments
    # have sponsors that are not bill sponsors.
    for link in page.xpath(
        "//div[contains(@class,'bill-table')][1]//td/span/a[contains(@href, 'Legislator-Profile')]"
    ):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    bdr_field = self.parse_bill_field(page, "Bill Request Number")
    if bdr_field != "" and bdr_field.xpath("text()"):
        bill.extras["BDR"] = bdr_field.xpath("text()")[0].strip()

    if self.parse_bill_field(page, "Summary of Original Version") != "":
        summary = (
            self.parse_bill_field(page, "Summary of Original Version")
            .text_content()
            .strip()
        )
        bill.add_abstract(summary, note="Summary of Original Version")

    if withdrawn:
        action = self.parse_bill_field(page, "Last Action").text_content().strip()
        wd_date = re.findall(r"\d{2}\/\d{2}\/\d+", action)[0]
        wd_date = dateutil.parser.parse(wd_date).date()
        bill.add_action(
            action, wd_date, chamber=chamber, classification="withdrawal"
        )

    yield bill
def scrape_bills(self, session):
    """Scrape all Oregon measures for *session* from the OLIS API, yielding Bills.

    Pulls measures in pages of 500, resolves sponsor codes against a
    legislator index, and attaches abstracts, sponsors, documents,
    committee amendments, and actions.
    """
    session_key = SESSION_KEYS[session]
    measures_response = self.api_client.get("measures", page=500, session=session_key)
    legislators = index_legislators(self, session_key)
    for measure in measures_response:
        bid = "{} {}".format(measure["MeasurePrefix"], measure["MeasureNumber"])
        # First letter of the prefix ("H"/"S") picks the chamber.
        chamber = self.chamber_code[bid[0]]
        bill = Bill(
            bid.replace(" ", ""),
            legislative_session=session,
            chamber=chamber,
            title=measure["RelatingTo"],
            classification=self.bill_types[measure["MeasurePrefix"][1:]],
        )
        bill.add_abstract(measure["MeasureSummary"].strip(), note="summary")
        for sponsor in measure["MeasureSponsors"]:
            legislator_code = sponsor["LegislatoreCode"]  # typo in API
            if legislator_code:
                # Fall back to the raw code when the legislator index
                # doesn't know this code, rather than dropping the sponsor.
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    logger.warn(
                        "Legislator {} not found in session {}".format(
                            legislator_code, session))
                    legislator = legislator_code
                bill.add_sponsorship(
                    name=legislator,
                    classification={
                        "Chief": "primary",
                        "Regular": "cosponsor"
                    }[sponsor["SponsorLevel"]],
                    entity_type="person",
                    primary=True if sponsor["SponsorLevel"] == "Chief" else False,
                )
        bill.add_source(
            "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}"
            .format(session=session_key, bid=bid.replace(" ", "")))
        for document in measure["MeasureDocuments"]:
            # TODO: probably mixing documents & versions here - should revisit
            document_url = url_fix(document["DocumentUrl"])
            # add_version_link raises ValueError on duplicate URLs; log and move on.
            try:
                bill.add_version_link(
                    document["VersionDescription"],
                    document_url,
                    media_type="application/pdf",
                )
            except ValueError:
                logger.warn(
                    "Duplicate link found for {}".format(document_url))
        # Only *adopted* committee amendments are recorded, as bill versions.
        for agenda_item in measure["CommitteeAgendaItems"]:
            for document in agenda_item["CommitteeProposedAmendments"]:
                if "adopted" in document["Meaning"].lower():
                    amd_name = "{} Amendment {}".format(
                        document["CommitteeCode"], document["AmendmentNumber"])
                    amendment_url = url_fix(
                        document["ProposedAmendmentUrl"])
                    bill.add_version_link(
                        amd_name,
                        amendment_url,
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )
        for action in measure["MeasureHistoryActions"]:
            classifiers = self.determine_action_classifiers(
                action["ActionText"])
            # API dates are naive ISO timestamps; localize to the scraper's tz.
            when = datetime.datetime.strptime(action["ActionDate"], "%Y-%m-%dT%H:%M:%S")
            when = self.tz.localize(when)
            bill.add_action(
                action["ActionText"],
                when,
                chamber=self.chamber_code[action["Chamber"]],
                classification=classifiers,
            )
        yield bill
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
    """Scrape a single Illinois bill page, yielding the Bill and its votes.

    :param chamber: "upper"/"lower" chamber of origin
    :param session: legislative session identifier
    :param doc_type: document-type prefix (e.g. "HB"); keys into DOC_TYPES
    :param url: bill status page URL
    :param bill_type: optional override for the bill classification

    Fixes vs. previous revision:
    - the chamber-less ``add_sponsorship`` call had its first two arguments
      swapped (classification passed as the sponsor name and vice versa);
      now matches the sibling call's ``(name, classification, ...)`` order.
    - ``actor_id`` is initialized before the action loop so an unexpected
      actor string no longer raises NameError (or silently reuses the
      previous row's actor).
    """
    try:
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
    except scrapelib.HTTPError as e:
        # Anything other than the site's known 500 is unexpected.
        assert (
            "500" in e.args[0]
        ), "Unexpected error when accessing page: {}".format(e)
        self.warning("500 error for bill page; skipping bill")
        return

    # bill id, title, summary
    bill_num = re.findall(r"DocNum=(\d+)", url)[0]
    bill_type = bill_type or DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath(
        '//span[text()="Short Description:"]/following-sibling::span[1]/'
        "text()"
    )[0].strip()
    # 1. Find the heading with "Synopsis As Introduced" for text.
    # 2. Go to the next heading.
    # 3. Backtrack and grab everything to, but not including, #1.
    # 4. Grab text of all, including nested, nodes.
    summary_nodes = doc.xpath(
        '//span[text()="Synopsis As Introduced"]/following-sibling::span[contains(@class, "heading2")]/'
        'preceding-sibling::*[preceding-sibling::span[text()="Synopsis As Introduced"]]//'
        "text()"
    )
    summary = "\n".join([node.strip() for node in summary_nodes])

    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        title=title,
        classification=bill_type,
        chamber=chamber,
    )
    bill.add_abstract(summary, note="")
    bill.add_source(url)

    # sponsors
    sponsor_list = build_sponsor_list(doc.xpath('//a[contains(@class, "content")]'))
    # don't add just yet; we can make them better using action data
    committee_actors = {}

    # actions: table rows are flat (date, actor, action) cell triples
    action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
    actor_id = None  # fallback when actor is neither "House" nor "Senate"
    for date, actor, action_elem in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
        date = self.localize(date).date()
        actor = actor.text_content()
        if actor == "House":
            actor_id = {"classification": "lower"}
        elif actor == "Senate":
            actor_id = {"classification": "upper"}
        action = action_elem.text_content()
        classification, related_orgs = _categorize_action(action)
        # Committee-actor resolution was disabled; kept for reference:
        # if related_orgs and any(c.startswith("committee") for c in classification):
        #     ((name, source),) = [
        #         (a.text, a.get("href"))
        #         for a in action_elem.xpath("a")
        #         if "committee" in a.get("href")
        #     ]
        #     source = canonicalize_url(source)
        #     actor_id = {"sources__url": source, "classification": "committee"}
        #     committee_actors[source] = name
        bill.add_action(
            action,
            date,
            organization=actor_id,
            classification=classification,
            related_entities=related_orgs,
        )
        if action.lower().find("sponsor") != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    for spontype, sponsor, chamber, official_type in sponsor_list:
        primary = official_type == "primary"
        if chamber:
            bill.add_sponsorship(
                sponsor, spontype, "person", primary=primary, chamber=chamber
            )
        else:
            # BUGFIX: arguments were previously (spontype, sponsor, ...) —
            # swapped name/classification.
            bill.add_sponsorship(sponsor, spontype, "person", primary=primary)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)
    yield bill

    votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
    yield from self.scrape_votes(session, bill, votes_url, committee_actors)
def scrape(self, session=None):
    """Scrape all Virginia bills for *session*, yielding VoteEvents and Bills.

    Bulk-loads members/sponsors/amendments/history/summaries/votes into
    instance dicts first, then assembles each bill from those caches.
    Defaults to the most recent legislative session when none is given.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["identifier"]
        self.info("no session specified, using %s", session)
    # First letter of a bill/action id selects the chamber.
    chamber_types = {
        "H": "lower",
        "S": "upper",
        "G": "executive",
        "C": "legislature",
    }
    session_id = SESSION_SITE_IDS[session]
    self._url_base += session_id + "/"
    bill_url_base = "https://lis.virginia.gov/cgi-bin/"
    # Bulk loaders populate self._bills, self._sponsors, etc. used below.
    self.load_members()
    self.load_sponsors()
    self.load_amendments()
    self.load_history()
    self.load_summaries()
    self.load_votes()
    self.load_bills()
    for bill in self._bills:
        bill = self._bills[bill][0]
        bill_id = bill["bill_id"]
        chamber = chamber_types[bill_id[0]]
        # Second letter: B=bill, J=joint resolution, R=resolution.
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        b = Bill(
            bill_id,
            session,
            bill["bill_description"],
            chamber=chamber,
            classification=bill_type,
        )
        bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
        b.add_source(bill_url)

        # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries.
        # Fill in blanks with 0s
        long_bill_id = bill_id
        if len(bill_id) == 3:
            long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
        elif len(bill_id) == 4:
            long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
        elif len(bill_id) == 5:
            long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]

        # Sponsors
        for spon in self._sponsors[long_bill_id]:
            sponsor_type = spon["patron_type"]
            if sponsor_type.endswith("Chief Patron"):
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            b.add_sponsorship(
                spon["member_name"],
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )

        # Summary
        summary_texts = self._summaries[long_bill_id]
        for sum_text in summary_texts:
            b.add_abstract(sum_text["summary_text"], sum_text["summary_type"])

        # Amendment docs
        amendments = self._amendments[bill_id]
        for amend in amendments:
            doc_link = (
                bill_url_base + f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
            b.add_document_link("Amendment: " + amend["txt_docid"], doc_link,
                                media_type="text/html")

        # Action text is used to improve version text
        actions_text = []
        # History and then votes
        for hist in self._history[bill_id]:
            action = hist["history_description"]
            action_date = hist["history_date"]
            date = datetime.datetime.strptime(action_date, "%m/%d/%y").date()
            # History strings are "<chamber letter>: <description>".
            chamber = chamber_types[action[0]]
            vote_id = hist["history_refid"]
            cleaned_action = action[2:]
            actions_text.append(cleaned_action)

            # categorize actions
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, cleaned_action):
                    break
            else:
                # for/else: no classifier pattern matched.
                atype = None
            if atype != SKIP:
                b.add_action(cleaned_action, date, chamber=chamber,
                             classification=atype)
            # Non-empty refid means this history row carries a roll-call vote.
            if len(vote_id) > 0:
                total_yes = 0
                total_no = 0
                total_not_voting = 0
                total_abstain = 0
                for v in self._votes[vote_id]:
                    if v["vote_result"] == "yes":
                        total_yes += 1
                    elif v["vote_result"] == "no":
                        total_no += 1
                    elif v["vote_result"] == "not voting":
                        total_not_voting += 1
                    elif v["vote_result"] == "abstain":
                        total_abstain += 1
                vote = VoteEvent(
                    identifier=vote_id,
                    start_date=date,
                    chamber=chamber,
                    motion_text=cleaned_action,
                    result="pass" if total_yes > total_no else "fail",
                    classification="passage",
                    bill=b,
                )
                vote.set_count("yes", total_yes)
                vote.set_count("no", total_no)
                vote.set_count("not voting", total_not_voting)
                vote.set_count("abstain", total_abstain)
                vote_url = (
                    bill_url_base
                    + f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                )
                vote.add_source(vote_url)
                for v in self._votes[vote_id]:
                    vote.vote(v["vote_result"], v["member_id"])
                yield vote

        # Versions
        for version in bill["text_docs"]:
            # Checks if abbr is blank as not every bill has multiple versions
            if len(version["doc_abbr"]) > 0:
                version_url = (
                    bill_url_base + f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                version_date = datetime.datetime.strptime(
                    version["doc_date"], "%m/%d/%y").date()
                version_text = version["doc_abbr"]
                # Prefer a full action description that mentions this
                # version abbreviation over the bare abbreviation.
                for act in actions_text:
                    if version_text in act:
                        version_text = act
                b.add_version_link(
                    version_text,
                    version_url,
                    date=version_date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
        yield b
def scrape(self, session=None):
    """Scrape all Indiana bills for *session* via the IGA API, yielding Bills
    (and, unless SCRAPE_WEB_VERSIONS is set, VoteEvents via _process_votes).

    Fix vs. previous revision: the amendment-failure test read
    ``if "fail" or "out of order" in d:`` — the bare string literal
    ``"fail"`` is always truthy, so *every* amendment action was tagged
    "amendment-failure". Now correctly tests both substrings.
    """
    self._bill_prefix_map = {
        "HB": {"type": "bill", "url_segment": "bills/house"},
        "HR": {"type": "resolution", "url_segment": "resolutions/house/simple"},
        "HCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "HC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "SB": {"type": "bill", "url_segment": "bills/senate"},
        "SR": {"type": "resolution", "url_segment": "resolutions/senate/simple"},
        "SCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
        "SC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
    }
    api_base_url = "https://api.iga.in.gov"
    # ah, indiana. it's really, really hard to find
    # pdfs in their web interface. Super easy with
    # the api, but a key needs to be passed
    # in the headers. To make these documents
    # viewable to the public and our scrapers,
    # we've put up a proxy service at this link
    # using our api key for pdf document access.
    client = ApiClient(self)
    r = client.get("bills", session=session)
    all_pages = client.unpaginate(r)
    for b in all_pages:
        bill_id = b["billName"]
        disp_bill_id = b["displayName"]
        bill_link = b["link"]
        api_source = api_base_url + bill_link
        try:
            bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
        except scrapelib.HTTPError:
            self.logger.warning("Bill could not be accessed. Skipping.")
            continue

        # Title fallback chain: description -> latest version short
        # description -> the bill id itself (with a warning).
        title = bill_json["description"]
        if title == "NoneNone":
            title = None
        if not title:
            title = bill_json["latestVersion"]["shortDescription"]
        if not title:
            title = bill_id
            self.logger.warning("Bill is missing a title, using bill id instead.")

        bill_prefix = self._get_bill_id_components(bill_id)[0]
        original_chamber = (
            "lower" if bill_json["originChamber"].lower() == "house" else "upper"
        )
        bill_type = self._bill_prefix_map[bill_prefix]["type"]
        bill = Bill(
            disp_bill_id,
            legislative_session=session,
            chamber=original_chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(self._get_bill_url(session, bill_id))
        bill.add_source(api_source)

        # sponsors
        for s in bill_json["authors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="author")
        for s in bill_json["coauthors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
        for s in bill_json["sponsors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
        for s in bill_json["cosponsors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="cosponsor")

        # actions
        action_link = bill_json["actions"]["link"]
        api_source = api_base_url + action_link
        try:
            actions = client.get(
                "bill_actions", session=session, bill_id=bill_id.lower()
            )
        except scrapelib.HTTPError:
            self.logger.warning("Could not find bill actions page")
            actions = {"items": []}
        for a in actions["items"]:
            action_desc = a["description"]
            if "governor" in action_desc.lower():
                action_chamber = "executive"
            elif a["chamber"]["name"].lower() == "house":
                action_chamber = "lower"
            else:
                action_chamber = "upper"
            date = a["date"]
            if not date:
                self.logger.warning("Action has no date, skipping")
                continue
            # convert time to pupa fuzzy time
            date = date.replace("T", " ")
            # TODO: if we update pupa to accept datetimes we can drop this line
            date = date.split()[0]

            action_type = []
            d = action_desc.lower()
            committee = None
            reading = False
            if "first reading" in d:
                action_type.append("reading-1")
                reading = True
            if "second reading" in d or "reread second time" in d:
                action_type.append("reading-2")
                reading = True
            if "third reading" in d or "reread third time" in d:
                action_type.append("reading-3")
                if "passed" in d:
                    action_type.append("passage")
                if "failed" in d:
                    action_type.append("failure")
                reading = True
            if "adopted" in d and reading:
                action_type.append("passage")
            if (
                "referred" in d
                and "committee on" in d
                or "reassigned" in d
                and "committee on" in d
            ):
                committee = d.split("committee on")[-1].strip()
                action_type.append("referral-committee")
            if "committee report" in d:
                if "pass" in d:
                    action_type.append("committee-passage")
                if "fail" in d:
                    action_type.append("committee-failure")
            if "amendment" in d and "without amendment" not in d:
                if "pass" in d or "prevail" in d or "adopted" in d:
                    action_type.append("amendment-passage")
                # BUGFIX: was `if "fail" or "out of order" in d:` — the bare
                # literal "fail" is always truthy, so this branch always fired.
                if "fail" in d or "out of order" in d:
                    action_type.append("amendment-failure")
                if "withdraw" in d:
                    action_type.append("amendment-withdrawal")
            if "signed by the governor" in d:
                action_type.append("executive-signature")
            if "vetoed by the governor" in d:
                action_type.append("executive-veto")
            if len(action_type) == 0:
                # calling it other and moving on with a warning
                self.logger.warning(
                    "Could not recognize an action in '{}'".format(action_desc)
                )
                action_type = None

            a = bill.add_action(
                chamber=action_chamber,
                description=action_desc,
                date=date,
                classification=action_type,
            )
            if committee:
                a.add_related_entity(committee, entity_type="organization")

        # subjects
        subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
        for subject in subjects:
            bill.add_subject(subject)

        # Abstract
        if bill_json["latestVersion"]["digest"]:
            bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")

        # put this behind a flag 2021-03-18 (openstates/issues#291)
        if not SCRAPE_WEB_VERSIONS:
            # votes
            yield from self._process_votes(
                bill_json["latestVersion"]["rollcalls"],
                disp_bill_id,
                original_chamber,
                session,
            )
            # versions (latest first in the API; iterate oldest-last reversed)
            self.deal_with_version(
                bill_json["latestVersion"], bill, bill_id, original_chamber, session
            )
            for version in bill_json["versions"][::-1]:
                self.deal_with_version(
                    version,
                    bill,
                    bill_id,
                    original_chamber,
                    session,
                )
        else:
            self.scrape_web_versions(session, bill, bill_id)

        yield bill
def scrape_bill(self, bill_num, session):
    """Scrape one Wyoming bill from the LSO JSON API, yielding votes then the Bill.

    :param bill_num: bill number used in the API URL
    :param session: session identifier; first 4 chars are the year for specials
    :returns: None (early) when the API request 404s/errors.

    Fix vs. previous revision: the chamber test was
    ``"lower" if bill_json["bill"][0] else "upper"`` — the first character
    of a nonempty bill id is always truthy, so every bill was tagged
    "lower". Now compares against "H" (House), matching chamber_map below.
    """
    chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}
    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = (
        "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
        "{}?calendarDate=".format(session, bill_num)
    )
    if self.is_special:
        bill_json_url = (
            "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
            "{}?specialSessionValue=1&calendarDate=".format(session[0:4], bill_num)
        )
    try:
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode("utf-8"))
    except scrapelib.HTTPError:
        return None

    # BUGFIX: "H..." bill ids are House; previously the bare char was
    # tested for truthiness, which is always True.
    chamber = "lower" if bill_json["bill"][0] == "H" else "upper"
    bill = Bill(
        identifier=bill_json["bill"],
        legislative_session=session,
        title=bill_json["catchTitle"],
        chamber=chamber,
        classification="bill",
    )
    bill.add_title(bill_json["billTitle"])

    source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
        session, bill_json["bill"]
    )
    if self.is_special:
        source_url = "http://lso.wyoleg.gov/Legislation/{}/{}?specialSessionValue=1".format(
            session[0:4], bill_json["bill"]
        )
    bill.add_source(source_url)

    for action_json in bill_json["billActions"]:
        utc_action_date = self.parse_local_date(action_json["statusDate"])
        actor = None
        if action_json["location"] and action_json["location"] in chamber_map:
            actor = chamber_map[action_json["location"]]
        action = bill.add_action(
            chamber=actor,
            description=action_json["statusMessage"],
            date=utc_action_date,
            classification=categorize_action(action_json["statusMessage"]),
        )
        action.extras = {"billInformationID": action_json["billInformationID"]}

    if bill_json["introduced"]:
        url = "http://wyoleg.gov/{}".format(bill_json["introduced"])
        bill.add_version_link(
            note="Introduced",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )
    if bill_json["enrolledAct"]:
        url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"])
        bill.add_version_link(
            note="Enrolled",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )
    if bill_json["fiscalNote"]:
        url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"])
        bill.add_document_link(
            note="Fiscal Note",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )
    if bill_json["digest"]:
        url = "http://wyoleg.gov/{}".format(bill_json["digest"])
        bill.add_document_link(
            note="Bill Digest",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )
    if bill_json["vetoes"]:
        for veto in bill_json["vetoes"]:
            url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"])
            bill.add_version_link(
                note=veto["vetoLinkText"],
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

    for amendment in bill_json["amendments"]:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        # TODO: There are no special session amendments yet,
        # but check this url format for specials
        url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
            session[0:4], amendment["amendmentNumber"]
        )
        if amendment["sponsor"] and amendment["status"]:
            title = "Amendment {} ({}) - {} ({})".format(
                amendment["amendmentNumber"],
                amendment["order"],
                amendment["sponsor"],
                amendment["status"],
            )
        else:
            title = "Amendment {} ({})".format(
                amendment["amendmentNumber"], amendment["order"]
            )
        # add versions of the bill text
        version = bill.add_version_link(
            note=title, url=url, media_type="application/pdf"
        )
        version["extras"] = {
            "amendmentNumber": amendment["amendmentNumber"],
            "sponsor": amendment["sponsor"],
        }

    for sponsor in bill_json["sponsors"]:
        status = "primary" if sponsor["primarySponsor"] else "cosponsor"
        sponsor_type = "person" if sponsor["sponsorTitle"] else "organization"
        bill.add_sponsorship(
            name=sponsor["name"],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor["primarySponsor"],
        )

    if bill_json["summary"]:
        bill.add_abstract(note="summary", abstract=bill_json["summary"])
    if bill_json["enrolledNumber"]:
        bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]
    if bill_json["chapter"]:
        bill.extras["chapter"] = bill_json["chapter"]
    if bill_json["effectiveDate"]:
        eff = datetime.datetime.strptime(bill_json["effectiveDate"], "%m/%d/%Y")
        bill.extras["effective_date"] = eff.strftime("%Y-%m-%d")
    bill.extras["wy_bill_id"] = bill_json["id"]

    for vote_json in bill_json["rollCalls"]:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def _parse_senate_billpage(self, bill_url, year):
    """Scrape one Missouri Senate bill page, yielding the Bill.

    :param bill_url: Senate bill detail page URL
    :param year: unused here — presumably kept for caller symmetry; TODO confirm
    """
    bill_page = self.lxmlize(bill_url)
    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath(
        '//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath(
        '//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()
    # First three characters of the id (e.g. "SB ") select the bill type.
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)
    # Placeholder/junk entries on the site use id "XXXXXX".
    if bid == "XXXXXX":
        self.info("Skipping Junk Bill")
        return
    # Note: the brief description is used as the title; the long title is
    # added separately via add_title below.
    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber="upper",
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note="abstract")
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)
    # Get the primary sponsor: link element normally, plain span otherwise.
    try:
        sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    except IndexError:
        sponsor = bill_page.xpath('//span[@id="lSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    # bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsorship(bill_sponsor, entity_type="person",
                         classification="primary", primary=True)
    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get("href"):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib["href"])
    # get the actions
    action_url = bill_page.xpath('//a[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib["href"]
        self._parse_senate_actions(bill, action_url)
    # stored on a separate page
    versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
    if len(versions_url) > 0 and versions_url[0].attrib.get("href"):
        self._parse_senate_bill_versions(bill, versions_url[0].attrib["href"])
    # Only *adopted* amendments are recorded, as bill versions.
    amendment_links = bill_page.xpath(
        '//a[contains(@href,"ShowAmendment.asp")]')
    for link in amendment_links:
        link_text = link.xpath("string(.)").strip()
        if "adopted" in link_text.lower():
            link_url = link.xpath("@href")[0]
            bill.add_version_link(
                link_text,
                link_url,
                media_type="application/pdf",
                on_duplicate="ignore",
            )
    yield bill
def test_full_bill():
    """End-to-end BillImporter test: build a fully-populated scraped bill
    (plus a prior-session bill it relates to), import both, and assert every
    field round-trips into the database correctly.
    """
    create_jurisdiction()
    person = Person.objects.create(name="Adam Smith")
    lower = Organization.objects.create(jurisdiction_id="jid", name="House",
                                        classification="lower")
    Membership.objects.create(person_id=person.id, organization_id=lower.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=lower,
    )
    # Prior-session bill that "HB 1" will reference via add_related_bill.
    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act",
                      classification="tax bill", chamber="lower")
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee", "1900-04-04",
                          chamber="lower")
    act.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    bill.add_related_bill("HB 99", legislative_session="1899",
                          relation_type="prior-session")
    # One sponsorship resolvable via pseudo-id, one left unlinked.
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    bill.add_sponsorship("Jane Smith", classification="lead sponsor",
                         entity_type="person", primary=True)
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link("Fiscal Note", "http://example.com/fn.pdf",
                           media_type="application/pdf")
    bill.add_document_link("Fiscal Note", "http://example.com/fn.html",
                           media_type="text/html")
    bill.add_version_link("Fiscal Note", "http://example.com/v/1",
                          media_type="text/html")
    bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get(
    ).organization == Organization.objects.get(classification="committee")

    # action computed fields
    assert b.first_action_date == "1900-04-01"
    assert b.latest_action_date == "1900-04-04"
    assert b.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"
    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            # "Jane Smith" had no matching Person, so stays unlinked.
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one Hawaii bill status page, yielding vote events then the Bill.

    The bill id is rebuilt from the page URL's query string
    (billtype + billnumber); metadata, versions, testimony, committee
    reports, and actions come from tables on the page.
    """
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]
    meta = self.parse_bill_metainf_table(metainf_table)
    # "Report Title" is a semicolon-separated subject list.
    subs = [s.strip() for s in meta["Report Title"].split(";")]
    if "" in subs:
        subs.remove("")
    b = Bill(
        bill_id,
        session,
        meta["Measure Title"],
        chamber=chamber,
        classification=bill_type,
    )
    if meta["Description"]:
        b.add_abstract(meta["Description"], "description")
    for subject in subs:
        b.add_subject(subject)
    if url:
        b.add_source(url)
    # Companion / carried-over bills are linked to the previous year's
    # regular session.
    prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
    companion = meta["Companion"].strip()
    if companion:
        b.add_related_bill(
            identifier=companion.replace(u"\xa0", " "),
            legislative_session=prior_session,
            relation_type="companion",
        )
    # If the earliest status row says "carried over", relate this bill id
    # to its prior-session incarnation as well.
    if bill_page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
    ):
        prior = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        )[-1]
        if "carried over" in prior.lower():
            b.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )
    for sponsor in meta["Introducer(s)"]:
        # Strip the by-request annotation from the sponsor name.
        if "(Introduced by request of another party)" in sponsor:
            sponsor = sponsor.replace(
                " (Introduced by request of another party)", "")
        b.add_sponsorship(sponsor, "primary", "person", True)
    self.parse_bill_versions_table(b, versions)
    self.parse_testimony(b, bill_page)
    self.parse_cmte_reports(b, bill_page)
    yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                             session, url, chamber)
    yield b
def scrape(self, session=None, chamber=None):
    """Scrape all Georgia legislation for *session* via the SOAP services,
    yielding VoteEvents and Bills.

    :param session: session identifier; defaults to the latest session
    :param chamber: unused here — presumably accepted for CLI symmetry; TODO confirm
    """
    # Second letter(s) of the document-type prefix select the bill type.
    bill_type_map = {
        "B": "bill",
        "R": "resolution",
        "JR": "joint resolution",
        "CR": "concurrent resolution",
    }
    # First letter of an action code selects the acting chamber.
    chamber_map = {
        "H": "lower",
        "S": "upper",
        "J": "joint",
        "E": "legislature",  # Effective date
    }
    # Site action code -> openstates action classification(s);
    # None means "recognized but unclassified".
    action_code_map = {
        "HI": None,
        "SI": None,
        "HH": None,
        "SH": None,
        "HPF": ["introduction"],
        "HDSAS": None,
        "SPF": ["introduction"],
        "HSR": ["reading-2"],
        "SSR": ["reading-2"],
        "HFR": ["reading-1"],
        "SFR": ["reading-1"],
        "HRECM": ["withdrawal", "referral-committee"],
        "SRECM": ["withdrawal", "referral-committee"],
        "SW&C": ["withdrawal", "referral-committee"],
        "HW&C": ["withdrawal", "referral-committee"],
        "HRA": ["passage"],
        "SRA": ["passage"],
        "HPA": ["passage"],
        "HRECO": None,
        "SPA": ["passage"],
        "HTABL": None,  # 'House Tabled' - what is this?
        "SDHAS": None,
        "HCFR": ["committee-passage-favorable"],
        "SCFR": ["committee-passage-favorable"],
        "HRAR": ["referral-committee"],
        "SRAR": ["referral-committee"],
        "STR": ["reading-3"],
        "SAHAS": None,
        "SE": ["passage"],
        "SR": ["referral-committee"],
        "HTRL": ["reading-3", "failure"],
        "HTR": ["reading-3"],
        "S3RLT": ["reading-3", "failure"],
        "HASAS": None,
        "S3RPP": None,
        "STAB": None,
        "SRECO": None,
        "SAPPT": None,
        "HCA": None,
        "HNOM": None,
        "HTT": None,
        "STT": None,
        "SRECP": None,
        "SCRA": None,
        "SNOM": None,
        "S2R": ["reading-2"],
        "H2R": ["reading-2"],
        "SENG": ["passage"],
        "HENG": ["passage"],
        "HPOST": None,
        "HCAP": None,
        "SDSG": ["executive-signature"],
        "SSG": ["executive-receipt"],
        "Signed Gov": ["executive-signature"],
        "HDSG": ["executive-signature"],
        "HSG": ["executive-receipt"],
        "EFF": None,
        "HRP": None,
        "STH": None,
        "HTS": None,
    }
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    sid = SESSION_SITE_IDS[session]
    legislation = backoff(self.lservice.GetLegislationForSession, sid)[
        "LegislationIndex"
    ]
    for leg in legislation:
        lid = leg["Id"]
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument["StatusHistory"][0]]

        # Status history arrives newest-first; reverse to chronological order.
        actions = reversed(
            [
                {
                    "code": x["Code"],
                    "action": x["Description"],
                    "_guid": x["Id"],
                    "date": x["Date"],
                }
                for x in history
            ]
        )

        guid = instrument["Id"]

        # A little bit hacky.
        bill_prefix = instrument["DocumentType"]
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = "%s %s" % (bill_prefix, instrument["Number"])
        if instrument["Suffix"]:
            bill_id += instrument["Suffix"]

        title = instrument["Caption"]
        description = instrument["Summary"]

        # Untitled instruments are skipped entirely.
        if title is None:
            continue

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_abstract(description, note="description")
        bill.extras = {"guid": guid}

        if instrument["Votes"]:
            for vote_ in instrument["Votes"]:
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])

                vote = VoteEvent(
                    start_date=vote_["Date"].strftime("%Y-%m-%d"),
                    motion_text=vote_["Caption"] or "Vote on Bill",
                    chamber={"House": "lower", "Senate": "upper"}[vote_["Branch"]],
                    result="pass" if vote_["Yeas"] > vote_["Nays"] else "fail",
                    classification="passage",
                    bill=bill,
                )
                vote.set_count("yes", vote_["Yeas"])
                vote.set_count("no", vote_["Nays"])
                vote.set_count("other", vote_["Excused"] + vote_["NotVoting"])

                vote.add_source(self.vsource)

                methods = {"Yea": "yes", "Nay": "no"}

                if vote_["Votes"] is not None:
                    for vdetail in vote_["Votes"][0]:
                        whom = vdetail["Member"]
                        how = vdetail["MemberVoted"]
                        if whom["Name"] == "VACANT":
                            continue
                        # Member names embed the district; split them apart.
                        name, district = vote_name_pattern.search(
                            whom["Name"]
                        ).groups()
                        vote.vote(methods.get(how, "other"), name, note=district)

                yield vote

        # Committees grouped by chamber, for relating to committee actions.
        ccommittees = defaultdict(list)
        committees = instrument["Committees"]
        if committees:
            for committee in committees[0]:
                ccommittees[
                    {"House": "lower", "Senate": "upper"}[committee["Type"]]
                ].append(committee["Name"])

        for action in actions:
            action_chamber = chamber_map[action["code"][0]]

            try:
                action_types = action_code_map[action["code"]]
            except KeyError:
                error_msg = "Code {code} for action {action} not recognized.".format(
                    code=action["code"], action=action["action"]
                )
                self.logger.warning(error_msg)
                action_types = None

            committees = []
            if action_types and any(("committee" in x for x in action_types)):
                committees = [str(x) for x in ccommittees.get(action_chamber, [])]

            act = bill.add_action(
                action["action"],
                action["date"].strftime("%Y-%m-%d"),
                classification=action_types,
                chamber=action_chamber,
            )
            for committee in committees:
                act.add_related_entity(committee, "organization")
            act.extras = {"code": action["code"], "guid": action["_guid"]}

        sponsors = []
        if instrument["Authors"]:
            sponsors = instrument["Authors"]["Sponsorship"]
        if "Sponsors" in instrument and instrument["Sponsors"]:
            sponsors += instrument["Sponsors"]["Sponsorship"]

        sponsors = [(x["Type"], self.get_member(x["MemberId"])) for x in sponsors]

        for typ, sponsor in sponsors:
            name = "{First} {Last}".format(**dict(sponsor["Name"]))
            # NOTE(review): "secondary" is not a standard openstates
            # sponsorship classification ("cosponsor" is) — confirm intent.
            bill.add_sponsorship(
                name,
                entity_type="person",
                classification="primary" if "Author" in typ else "secondary",
                primary="Author" in typ,
            )

        for version in instrument["Versions"]["DocumentDescription"]:
            name, url, doc_id, version_id = [
                version[x] for x in ["Description", "Url", "Id", "Version"]
            ]
            link = bill.add_version_link(name, url, media_type="application/pdf")
            link["extras"] = {
                "_internal_document_id": doc_id,
                "_version_id": version_id,
            }

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(**{"session": session, "bid": guid}))

        yield bill