def parse_versions(self, page, bill):
    """Add a version link to *bill* for each document row on the page.

    Looks up every anchor in the first cell of the "Bill Documents"
    table row and registers it as a version; returns the number of
    versions found so callers can detect empty (withdrawn) bills.
    """
    links = page.xpath('//tr[th[text()="Bill Documents"]]/td[1]/a')
    for link in links:
        doc_url = link.attrib["href"]
        doc_title = link.xpath("text()")[0].strip()
        media = get_media_type(doc_url)
        if media is None:
            # Still attach the version; just flag the unknown type.
            self.warning("Unknown mimetype for {}".format(doc_url))
        bill.add_version_link(doc_title, doc_url, media_type=media)
    return len(links)
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape one bill page and yield a Bill (generator).

    Bails out early on HTTP errors, on bills whose last action marks
    them WITHDRAWN, and on bills that have no version documents.
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    # A non-empty "Last Action" containing WITHDRAWN means skip entirely.
    last_action_field = self.parse_bill_field(page, "Last Action")
    if last_action_field != "":
        last_action = last_action_field.xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

    title = self.parse_bill_field(page, "Title").text_content()

    # Classify from the id suffix; check "CR"/"JR" before the bare "R".
    for marker, kind in (
        ("CR", "concurrent resolution"),
        ("JR", "joint resolution"),
        ("R", "resolution"),
    ):
        if marker in bill_id:
            bill_type = kind
            break
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    if self.parse_versions(page, bill) < 1:
        # No version documents at all: treat the bill as withdrawn.
        self.logger.warning("Bill withdrawn.")
        return

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    for note in page.xpath('//a[contains(@href, "/LM.pdf")]'):
        note_url = note.attrib["href"]
        bill.add_document_link(
            "Fiscal Note", note_url, media_type=get_media_type(note_url)
        )

    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    bdr_field = self.parse_bill_field(page, "Bill Request Number")
    if bdr_field != "" and bdr_field.xpath("text()"):
        bill.extras["BDR"] = bdr_field.xpath("text()")[0].strip()

    yield bill
def scrape_lower(self):
    """Scrape lower-chamber committee hearing events from the schedules page.

    Yields one Event per scheduled committee meeting, skipping floor
    sessions, unlinked entries, and joint hearings (those come from the
    senate API instead).
    """
    url = "https://www.house.leg.state.mn.us/Schedules/All"
    page = self.lxmlize(url)

    for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
        # skip floor sessions and unlinked events
        if not row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
        ):
            continue

        # skip joint ones, we'll get those from the senate API
        if row.xpath('div[contains(@class,"card-header bg-joint")]'):
            continue

        # top-level committee
        com = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        )[0].strip()
        com_link = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
        )[0]

        when = (
            row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0]
            .replace("\r\n", "")
            .strip()
        )
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)

        if row.xpath('.//b[.="Location:"]'):
            where = row.xpath(
                './/b[.="Location:"]/following-sibling::text()[1]'
            )[0].strip()
        else:
            where = "See committee page"

        if row.xpath('.//b[.="Agenda:"]'):
            desc = "\n".join(
                row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()')
            ).strip()
        else:
            desc = "See committee page"

        event = Event(
            name=com,
            start_date=when,
            location_name=where,
            classification="committee-meeting",
            description=desc,
        )
        event.add_source(com_link)

        # BUG FIX: attach each bill id parsed out of the agenda text —
        # previously this passed the entire description string instead
        # of the individual bill id.
        for bill in get_bill_ids(desc):
            event.add_bill(bill)

        if row.xpath(
            ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
        ):
            agenda = event.add_agenda_item("Bills")
            for bill_id in row.xpath(
                ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
            ):
                agenda.add_bill(bill_id.strip())

        for attachment in row.xpath(".//ul/li/div/a"):
            doc_url = attachment.xpath("@href")[0]
            doc_name = attachment.xpath("text()")[0].strip()
            # if they don't provide a name just use the filename
            if doc_name == "":
                parsed_url = urlparse(doc_url)
                # BUG FIX: basename() needs the path string; passing the
                # ParseResult tuple raises TypeError.
                doc_name = os.path.basename(parsed_url.path)
            # sometimes broken links to .msg files (emails?) are attached,
            # they always 404.
            if doc_url.endswith(".msg"):
                continue
            media_type = get_media_type(doc_url)
            event.add_document(
                doc_name, doc_url, media_type=media_type, on_duplicate="ignore"
            )

        for committee in row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        ):
            event.add_participant(committee, type="committee", note="host")

        yield event
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape one bill page and yield a Bill (generator).

    Withdrawn bills are still emitted: they get the title "Withdrawn."
    and a withdrawal action dated from the "Last Action" field.
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    # Detect withdrawal from the "Last Action" field but keep scraping.
    withdrawn = False
    last_action_field = self.parse_bill_field(page, "Last Action")
    if last_action_field != "":
        last_action = last_action_field.xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            withdrawn = True

    title = (
        "Withdrawn."
        if withdrawn
        else self.parse_bill_field(page, "Title").text_content()
    )

    # Classify from the id suffix; check "CR"/"JR" before the bare "R".
    for marker, kind in (
        ("CR", "concurrent resolution"),
        ("JR", "joint resolution"),
        ("R", "resolution"),
    ):
        if marker in bill_id:
            bill_type = kind
            break
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    self.parse_versions(page, bill)
    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    for note in page.xpath('//a[contains(@href, "/LM.pdf")]'):
        note_url = note.attrib["href"]
        bill.add_document_link(
            "Fiscal Note", note_url, media_type=get_media_type(note_url)
        )

    # Only the first table: proposed amendments list sponsors who are
    # not sponsors of the bill itself.
    sponsor_xpath = (
        "//div[contains(@class,'bill-table')][1]"
        "//td/span/a[contains(@href, 'Legislator-Profile')]"
    )
    for link in page.xpath(sponsor_xpath):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    bdr_field = self.parse_bill_field(page, "Bill Request Number")
    if bdr_field != "" and bdr_field.xpath("text()"):
        bill.extras["BDR"] = bdr_field.xpath("text()")[0].strip()

    summary_field = self.parse_bill_field(page, "Summary of Original Version")
    if summary_field != "":
        bill.add_abstract(
            summary_field.text_content().strip(),
            note="Summary of Original Version",
        )

    if withdrawn:
        # Record the withdrawal itself as an action, dated from the
        # mm/dd/yyyy stamp embedded in the "Last Action" text.
        action = self.parse_bill_field(page, "Last Action").text_content().strip()
        wd_date = dateutil.parser.parse(
            re.findall(r"\d{2}\/\d{2}\/\d+", action)[0]
        ).date()
        bill.add_action(
            action, wd_date, chamber=chamber, classification="withdrawal"
        )

    yield bill