def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies different
             formatting to events based on which group they correspond to.
             `group` should be one of the following strings: 'house',
             'senate', or 'commission'.

    Yields Event objects; meetings missing a date, description, or
    location are silently skipped.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)
        # Only emit an event when all three core fields were extracted.
        if when and description and location:
            event = Event(
                name=description,
                start_date=when.replace(tzinfo=self.tz),
                description=description,
                location_name=location,
            )
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not a parameter or local of this
            # function — presumably a module/class-level name; confirm it
            # is in scope at runtime, otherwise this raises NameError.
            event.add_source(url)
            yield event
def scrape_meeting_notice(self, chamber, item, url):
    """Convert one Delaware committee-meeting JSON record into an Event.

    Arguments:
    chamber -- chamber string (not used directly here)
    item -- JSON dict describing a single committee meeting
    url -- listing-page URL, recorded as an event source

    Yields a single Event with agenda items pulled from a secondary
    JSON endpoint.
    """
    # Since Event Name is not provided for all meetings, fall back to the
    # committee name.
    event_name = str(item["CommitteeName"])
    # Sample value: 04/25/2012 03:00:00 PM
    # NOTE(review): the sample above shows a 4-digit year and seconds,
    # but the format below expects a 2-digit year and no seconds —
    # confirm against the live feed.
    fmt = "%m/%d/%y %I:%M %p"
    start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_source(url)
    event.add_committee(name=str(item["CommitteeName"]), id=item["CommitteeId"])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item["CommitteeMeetingId"]))
    event.add_source(page_url)
    page_data = self.post(page_url).json()["Data"]
    # NOTE: this loop rebinds `item`, shadowing the parameter above;
    # the parameter is not used again afterwards, so this is safe but
    # confusing.
    for item in page_data:
        event.add_agenda_item(description=str(item["ItemDescription"]))
        event.add_person(
            name=str(item["PrimarySponsorShortName"]),
            id=str(item["PrimarySponsorPersonId"]),
            note="Sponsor",
        )
    yield event
def scrape_lower_event(self, url):
    """Scrape a single FL House committee-meeting page into an Event.

    url -- meeting detail page; pages whose body contains "not meeting"
    are skipped. Yields one Event with agenda items and any bill numbers
    found in the agenda text.
    """
    html = self.get(url).text
    if "not meeting" in html.lower():
        self.info(f"Skipping {url}, not meeting")
        return
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)
    com = (page.xpath('//div[contains(@class,"sectionhead")]/h1')
           [0].text_content().strip())
    com = f"House {com}"
    start = self.get_meeting_row(page, "Start Date")
    start = self.tz.localize(dateutil.parser.parse(start))
    end = None
    # End Date is optional on these pages.
    if self.get_meeting_row(page, "End Date"):
        end = self.get_meeting_row(page, "End Date")
        end = self.tz.localize(dateutil.parser.parse(end))
    location = self.get_meeting_row(page, "Location")
    summary = ""
    if page.xpath('//div[contains(text(),"Meeting Overview")]'):
        summary = (page.xpath(
            '//div[div[contains(text(),"Meeting Overview")]]/div[contains(@class,"ml-3")]'
        )[0].text_content().strip())
    # Event() is constructed with or without end_date depending on
    # whether the page provided one.
    if end:
        event = Event(
            name=com,
            start_date=start,
            end_date=end,
            location_name=location,
            description=summary,
        )
    else:
        event = Event(name=com,
                      start_date=start,
                      location_name=location,
                      description=summary)
    event.add_source(url)
    for h5 in page.xpath(
            '//div[contains(@class,"meeting-actions-bills")]/h5'):
        # The h5 is a section header; its sibling list holds the items.
        event.add_agenda_item(h5.text_content().strip())
        for agenda_item in h5.xpath("following-sibling::ul/li"):
            agenda_text = agenda_item.text_content().strip()
            # normalize en-dash separators to a plain hyphen
            agenda_text = re.sub(r"\s+\u2013\s+", " - ", agenda_text)
            item = event.add_agenda_item(agenda_text)
            # NOTE(review): greedy pattern — grabs from the first "H" to
            # the last digit run in the line; confirm it matches only the
            # intended bill designation.
            found_bills = re.findall(r"H.*\s+\d+", agenda_text)
            if found_bills:
                item.add_bill(found_bills[0])
    yield event
def scrape_upper(self):
    """Scrape MO Senate hearing events from the hearings schedule page.

    Yields one Event per <hr />-separated fragment, with agenda items
    and bill links where present.
    """
    listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"
    html = self.get(listing_url).text
    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split("<hr />")[1:]:
        page = lxml.html.fromstring(fragment)
        when_date = self.row_content(page, "Date:")
        when_time = self.row_content(page, "Time:")
        location = self.row_content(page, "Room:")
        # Rooms are numbers only; append the capitol street address.
        location = "{}, {}".format(
            location, "201 W Capitol Ave, Jefferson City, MO 65101")
        # com = self.row_content(page, 'Committee:')
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
        )[0]
        # Drop the trailing chair name, e.g. "Ways, Senator Smith".
        com = com.split(", Senator")[0].strip()
        start_date = self._TZ.localize(
            dateutil.parser.parse("{} {}".format(when_date, when_time)))
        event = Event(start_date=start_date, name=com, location_name=location)
        event.add_source(listing_url)
        event.add_participant(com, type="committee", note="host")
        for bill_table in page.xpath(
                '//table[@width="85%" and @border="0"]'):
            bill_link = ""
            if bill_table.xpath(self.bill_link_xpath):
                # Tables with a bill link: row 2 holds the agenda text.
                agenda_line = bill_table.xpath("string(tr[2])").strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)
                bill_link = bill_table.xpath(
                    self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                # No bill link: row 1 holds the agenda text.
                agenda_line = bill_table.xpath("string(tr[1])").strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)
        yield event
def parse_div(self, row, chamber, com): cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0] # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip() title, location, start_date, end_date = self.parse_gcal(cal_link) event = Event(start_date=start_date, end_date=end_date, name=title, location_name=location) event.add_source( "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx") for item in row.xpath('.//div[@class="col-xs-12a Item"]'): description = item.xpath("string(.)").strip() agenda = event.add_agenda_item(description=description) for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'): description = item.xpath("string(.)").strip() agenda = event.add_agenda_item(description=description) event.add_document( description, item.xpath("@href")[0], media_type="application/pdf", on_duplicate="ignore", ) for item in row.xpath('.//div[contains(@class,"ItemContainer")]' '[./div[@class="col-xs-1 Item"]]'): description = item.xpath("string(.)").strip() agenda = event.add_agenda_item(description=description) bill = item.xpath( './/div[@class="col-xs-1 Item"]/a/text()')[0].strip() agenda.add_bill(bill) video = row.xpath('.//a[./span[@class="OnDemand"]]') if video: event.add_media_link("Video of Hearing", video[0].xpath("@href")[0], "text/html") if "subcommittee" in title.lower(): subcom = title.split("-")[0].strip() event.add_participant(subcom, type="committee", note="host") else: event.add_participant(com, type="committee", note="host") yield event
def scrape_lower_event(self, url):
    """Build a single Event from an NY Assembly hearing agenda page."""
    root = lxml.html.fromstring(self.get(url).content)
    root.make_links_absolute(url)

    agenda_table = root.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
    header_cells = agenda_table.xpath("tr[1]/td[1]/text()")

    # careful, the committee name in the page #committee_div
    # is getting inserted via JS
    # so use the one from the table, and strip the chair name
    committee = re.sub(r"\(.*\)", "", header_cells[0])
    committee = f"Assembly {committee}"

    start = self._tz.localize(dateutil.parser.parse(header_cells[1]))
    venue = header_cells[2]

    event = Event(
        name=committee,
        start_date=start,
        location_name=venue,
    )
    event.add_participant(committee, type="committee", note="host")
    event.add_source(url)

    # Any /leg/ links in the table are bills under consideration.
    bill_links = agenda_table.xpath('.//a[contains(@href, "/leg/")]')
    if bill_links:
        agenda = event.add_agenda_item("Bills under Consideration")
        for link in bill_links:
            agenda.add_bill(link.text_content().strip())

    yield event
def upper_parse_agenda_item(self, item):
    """Expand one NY Senate API agenda item into Events.

    item -- dict with 'agendaId', 'committeeId', and 'addendum' keys.
    Yields one Event per addendum whose id matches item['addendum'] and
    whose notes do not say "canceled".
    """
    response = self.api_client.get(
        "meeting",
        year=item["agendaId"]["year"],
        agenda_id=item["agendaId"]["number"],
        committee=item["committeeId"]["name"],
    )
    data = response["result"]
    chamber = data["committee"]["committeeId"]["chamber"].title()
    com_code = data["committee"]["committeeId"]["name"]
    com_name = f"{chamber} {com_code}"
    # each "meeting" is actually a listing page of multiple meetings of the same committee
    # broken out by different addendumId
    for addendum in data["committee"]["addenda"]["items"]:
        if addendum["addendumId"] != item["addendum"]:
            continue
        meeting = addendum["meeting"]
        when = dateutil.parser.parse(meeting["meetingDateTime"])
        when = self._tz.localize(when)
        location = meeting["location"]
        description = meeting["notes"]
        if location == "":
            location = "See Committee Site"
        if "canceled" in description.lower():
            continue
        event = Event(
            name=com_name,
            start_date=when,
            location_name=location,
            description=description,
        )
        event.add_participant(com_name, type="committee", note="host")
        # Slugify the committee code to build the public committee URL.
        # (Re-slugging an already-slugged code on later iterations is a
        # no-op, so the rebinding here is harmless.)
        com_code = (com_code.lower().replace("'", "").replace(" ", "-").replace(
            ",", ""))
        url = f"https://www.nysenate.gov/committees/{com_code}"
        event.add_source(url)
        bills = addendum["bills"]["items"]
        if len(bills) > 0:
            agenda = event.add_agenda_item("Bills under consideration")
            for bill in bills:
                agenda.add_bill(bill["billId"]["printNo"])
        yield event
def scrape_events(self, session, start_date):
    """Scrape OR committee meetings on or after start_date for a session.

    session -- session name, a key into SESSION_KEYS.
    start_date -- 'YYYY-MM-DD' string, or None to default to today.

    Raises EmptyScrape when the API returns no meetings.
    Yields one Event per meeting with agenda items and documents.
    """
    session_key = SESSION_KEYS[session]
    if start_date is None:
        start_date = datetime.date.today()
    else:
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    # Map CommitteeCode -> CommitteeName so events can be named.
    committees_by_code = {}
    committees_response = self.api_client.get("committees",
                                              session=session_key)
    for committee in committees_response:
        committees_by_code[
            committee["CommitteeCode"]] = committee["CommitteeName"]
    meetings_response = self.api_client.get(
        "committee_meetings",
        start_date=start_date.strftime(self._DATE_FORMAT),
        session=session_key,
    )
    if len(meetings_response) == 0:
        raise EmptyScrape
    for meeting in meetings_response:
        event_date = self._TZ.localize(
            datetime.datetime.strptime(meeting["MeetingDate"],
                                       self._DATE_FORMAT))
        com_name = committees_by_code[meeting["CommitteeCode"]]
        event = Event(start_date=event_date,
                      name=com_name,
                      location_name=meeting["Location"])
        event.add_source(meeting["AgendaUrl"])
        event.extras["meeting_guid"] = meeting["MeetingGuid"]
        # BUG FIX: previously read committee["CommitteeCode"] — the stale
        # loop variable left over from the committees loop above — so
        # every event was tagged with the *last* committee's code.
        event.extras["committee_code"] = meeting["CommitteeCode"]
        event.add_participant(com_name, type="committee", note="host")
        for row in meeting["CommitteeAgendaItems"]:
            if row["Comments"] is not None:
                agenda = event.add_agenda_item(row["Comments"])
                if row["MeasureNumber"] is not None:
                    bill_id = "{} {}".format(row["MeasurePrefix"],
                                             row["MeasureNumber"])
                    agenda.add_bill(bill_id)
        for row in meeting["CommitteeMeetingDocuments"]:
            event.add_document(
                note=row["ExhibitTitle"],
                url=row["DocumentUrl"],
                on_duplicate="ignore",
            )
        yield event
def scrape_events(self, page):
    """Scrape NE committee hearings from the legislature calendar HTML.

    page -- raw HTML of the calendar listing.

    Raises EmptyScrape when the page reports no hearings for the range.
    Yields one Event per hearing card, with agenda items and bill links.
    """
    page = lxml.html.fromstring(page)
    if page.xpath(
            "//h3[contains(text(),'There are no hearings for the date range')]"
    ):
        raise EmptyScrape
    # (Removed an unreachable `return` that followed the raise above.)
    for meeting in page.xpath('//div[@class="card mb-4"]'):
        com = meeting.xpath(
            'div[contains(@class, "card-header")]/text()')[0].strip()
        details = meeting.xpath(
            'div[contains(@class, "card-header")]/small/text()')[0].strip(
            )
        (location, time) = details.split(" - ")
        # turn room numbers into the full address
        if location.lower().startswith("room"):
            location = "1445 K St, Lincoln, NE 68508, {}".format(location)
        # The day header precedes the cards; take the nearest one.
        day = meeting.xpath(
            "./preceding-sibling::h2[@class='text-center']/text()"
        )[-1].strip()
        # e.g. Thursday February 27, 2020 1:30 PM
        date = "{} {}".format(day, time)
        event_date = self._tz.localize(
            datetime.datetime.strptime(date, "%A %B %d, %Y %I:%M %p"))
        event = Event(
            name=com,
            start_date=event_date,
            classification="committee-meeting",
            description="Committee Meeting",
            location_name=location,
        )
        event.add_committee(com, note="host")
        for row in meeting.xpath("div/table/tr"):
            # Rows without a third cell are headers/spacers.
            if not row.xpath("td[3]"):
                continue
            agenda_desc = row.xpath("td[3]/text()")[0].strip()
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if row.xpath("td[1]/a"):
                # bill link
                agenda_item.add_bill(
                    row.xpath("td[1]/a/text()")[0].strip())
        event.add_source(
            "https://nebraskalegislature.gov/calendar/calendar.php")
        yield event
def scrape_event_page(self, session, chamber, url, datetime):
    """Parse one TX hearing-notice page into a single Event.

    `datetime` is the pre-parsed, naive meeting start time (the
    parameter name shadowing the module is preserved from the original
    signature).
    """
    doc = self.lxmlize(url)

    meta = {}
    text_parts = []
    for para in doc.xpath("//p"):
        normalized = re.sub(r"\s+", " ", para.text_content())
        text_parts.append(normalized)
        # "KEY: value" paragraphs populate the metadata map.
        if ":" in normalized:
            label, value = normalized.split(":", 1)
            meta[label.strip()] = value.strip()

    committee = meta["COMMITTEE"]
    where = meta["PLACE"]
    # The PLACE cell sometimes embeds the chair: "Room X CHAIR: Sen. Y".
    if "CHAIR" in where:
        where, chair_text = where.split("CHAIR:")
        meta["PLACE"] = where.strip()
        meta["CHAIR"] = chair_text.strip()
    chair = meta.get("CHAIR")

    plaintext = re.sub(r"\s+", " ", "\n".join(text_parts)).strip()
    bills = re.findall(r"(S|J|H)(B|M|R) (\d+)", plaintext)

    event = Event(
        name=committee,
        start_date=self._tz.localize(datetime),
        location_name=where,
    )
    event.add_source(url)
    event.add_participant(committee, type="committee", note="host")
    if chair is not None:
        event.add_participant(chair, type="legislator", note="chair")

    for prefix, btype, number in bills:
        item = event.add_agenda_item("Bill up for discussion")
        item.add_bill("%s%s %s" % (prefix, btype, number))

    event.add_agenda_item(plaintext)
    yield event
def scrape(self):
    """Scrape committee hearing events from the agenda calendar page.

    Yields one Event per calendar row that links to committee details
    and has a non-empty item list.
    """
    # NOTE(review): `calurl` is not defined in this function — presumably
    # a module-level constant; confirm it is in scope.
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]
    # NOTE: the loop variable `event` (a table row) is rebound below to
    # the Event object — confusing but harmless, since the row is not
    # used after the rebinding.
    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@title,'Committee Details')]")
        if len(comit_url) != 1:
            continue
        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib["href"])
        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib["href"]
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]
        # The listing omits the year; assume the current year.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")
        if cttie:
            cttie = cttie.replace("Committee on", "").strip()
            cttie = f"{chamber} {cttie}"
            name = cttie
        event = Event(
            name=name, location_name=where, start_date=self._tz.localize(when)
        )
        event.add_source(calurl)
        event.add_committee(cttie, note="host")
        event.add_document("notice", notice, media_type="application/pdf")
        for entry in what:
            item = event.add_agenda_item(entry)
            if entry.startswith("AB") or entry.startswith("SB"):
                item.add_bill(entry)
        for thing in who:
            event.add_person(thing["name"])
        yield event
def scrape_chamber(self, chamber):
    """Scrape PA committee-meeting events for one chamber.

    chamber -- 'upper' or 'lower'; selects the listing URL from
    utils.urls. Yields one Event per meeting row; a row whose date/time
    fails to parse ends processing of that table (original behavior
    preserved — presumably past-meeting rows are unparseable).
    """
    url = utils.urls["events"][chamber]
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for table in page.xpath(
            '//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # The date lives on an anchor in the enclosing detail div.
        date_string = table.xpath(
            'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
        for row in table.xpath("tr"):
            time_string = row.xpath(
                'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
            description = (
                row.xpath('td[@class="CMS-MeetingDetail-Agenda"]/div/div')
                [-1].text_content().strip())
            location = (row.xpath('td[@class="CMS-MeetingDetail-Location"]'
                                  )[0].text_content().strip())
            committees = row.xpath(
                './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a'
            )
            bills = row.xpath('.//a[contains(@href, "billinfo")]')
            try:
                start_date = datetime.datetime.strptime(
                    "{} {}".format(date_string, time_string),
                    "%m/%d/%Y %I:%M %p")
            except ValueError:
                break
            event = Event(
                name=description,
                start_date=self._tz.localize(start_date),
                location_name=location,
            )
            event.add_source(url)
            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # BUG FIX: parse_qs maps each key to a *list* of
                    # values; formatting the lists directly produced bill
                    # ids like "['H']['B'] ['123']". Unwrap the first
                    # value of each.
                    item.add_bill("{}{} {}".format(qs["body"][0],
                                                   qs["type"][0],
                                                   qs["bn"][0]))
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # Same parse_qs list unwrapping for the committee id.
                    item.add_committee(
                        re.sub(r" \([S|H]\)$", "", committee.text),
                        id=qs.get("Code", [None])[0],
                    )
            yield event
def parse_event(self, row, chamber):
    """Build an Event from one AK meeting XML row.

    row -- lxml element for a single meeting
    chamber -- chamber key into self.COMMITTEES / COMMITTEES_PRETTY
    """
    # sample event available at http://www.akleg.gov/apptester.html
    committee_code = row.xpath("string(Sponsor)").strip()
    if committee_code in self.COMMITTEES[chamber]:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            self.COMMITTEES[chamber][committee_code]["name"],
        )
    else:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            "MISCELLANEOUS",
        )
    name = "{} {}".format(self.COMMITTEES_PRETTY[chamber],
                          row.xpath("string(Title)").strip())
    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    # NOTE(review): `name` always contains the chamber prefix plus a
    # space, so it can never equal "" — this fallback looks dead.
    if name == "":
        name = committee_name
    location = row.xpath("string(Location)").strip()
    # events with no location all seem to be committee hearings
    if location == "":
        location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"
    start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
    # todo: do i need to self._TZ.localize() ?
    event = Event(start_date=start_date, name=name, location_name=location)
    event.add_source("http://w3.akleg.gov/index.php#tab4")
    if committee_code in self.COMMITTEES[chamber]:
        event.add_participant(committee_name, type="committee", note="host")
    for item in row.xpath("Agenda/Item"):
        agenda_desc = item.xpath("string(Text)").strip()
        if agenda_desc != "":
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath("BillRoot"):
                bill_id = item.xpath("string(BillRoot)")
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r"\s+", " ", bill_id)
                agenda_item.add_bill(bill_id)
    yield event
def scrape_lower_item(self, page):
    """Scrape one MO House hearing entry (lxml element) into an Event."""
    # print(lxml.etree.tostring(page, pretty_print=True))
    com = self.table_row_content(page, "Committee:")
    when_date = self.table_row_content(page, "Date:")
    when_time = self.table_row_content(page, "Time:")
    location = self.table_row_content(page, "Location:")
    if "house hearing room" in location.lower():
        location = "{}, {}".format(
            location, "201 W Capitol Ave, Jefferson City, MO 65101")
    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(" :", ":")
    # a.m. and p.m. seem to confuse dateutil.parser
    when_time = when_time.replace("A.M.", "AM").replace("P.M.", "PM")
    # some times have extra info after the AM/PM
    # (drops the AM/PM marker itself too; dateutil then defaults to AM)
    if "upon" in when_time:
        when_time = when_time.split("AM", 1)[0]
        when_time = when_time.split("PM", 1)[0]
    # fix '- Upcoming', '- In Progress' in dates
    when_date = re.sub(r"- (.*)", "", when_date).strip()
    try:
        start_date = dateutil.parser.parse(f"{when_date} {when_time}")
    except dateutil.parser._parser.ParserError:
        # Fall back to date-only when the time fragment is unparseable.
        start_date = dateutil.parser.parse(when_date)
    start_date = self._TZ.localize(start_date)
    event = Event(start_date=start_date, name=com, location_name=location)
    event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")
    event.add_participant(com, type="committee", note="host")
    # different from general MO link xpath due to the <b>
    house_link_xpath = ('.//a[contains(@href, "Bill.aspx") '
                        'or contains(@href, "bill.aspx")]/b/text()')
    for bill_title in page.xpath(house_link_xpath):
        bill_no = bill_title.split("--")[0].strip()
        bill_no = bill_no.replace("HCS", "").strip()
        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)
    yield event
def scrape_chamber(self, chamber):
    """Scrape CA committee hearings for one chamber from the DB download.

    chamber -- 'upper' or 'lower'; hearings are routed by the location
    prefix ('Asm' / 'Sen'). Hearings are grouped by (location, date)
    and each group becomes one Event.
    """
    grouped_hearings = defaultdict(list)
    for hearing in self.session.query(CACommitteeHearing):
        location = (self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description)
        date = self._tz.localize(hearing.hearing_date)
        chamber_abbr = location[0:3]
        event_chamber = {"Asm": "lower", "Sen": "upper"}[chamber_abbr]
        if event_chamber != chamber:
            continue
        grouped_hearings[(location, date)].append(hearing)
    for ((location, date), hearings) in grouped_hearings.items():
        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        # e.g. "20190AB123" -> "AB 123"
        bills = [
            "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
            for bill in bill_ids
        ]
        # Dereference the committee_nr number and get display name.
        msg = "More than one committee meeting at (location, date) %r"
        msg = msg % ((location, date), )
        assert len(set(hearing.committee_nr
                       for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]
        desc = "Committee Meeting: " + committee_name
        # NOTE(review): location_name is set to the committee name rather
        # than the `location` value — confirm this is intentional.
        event = Event(name=desc,
                      start_date=date,
                      location_name=committee_name)
        for bill_id in bills:
            if "B" in bill_id:
                type_ = "bill"
            else:
                type_ = "resolution"
            item = event.add_agenda_item("consideration")
            item.add_bill(bill_id, note=type_)
        event.add_person(committee_name + " Committee", note="host")
        event.add_source("https://downloads.leginfo.legislature.ca.gov/")
        yield event
def scrape(self, session=None):
    """Scrape NJ committee meeting events from the Agendas MDB table.

    session -- legislative session number as a string/int; defaults to
    the latest session. Yields one Event per scheduled agenda record.
    """
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    # NJ sessions map to two-year spans starting in even years.
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    # Keep record of all events
    records = self.access_to_csv("Agendas")
    for record in records:
        if record["Status"] != "Scheduled":
            continue
        description = record["Comments"]
        related_bills = []
        # Bill references like "A-1234" / "S1234" embedded in comments.
        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })
        date_time = "%s %s" % (record["Date"], record["Time"])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
        try:
            hr_name = self._committees[record["CommHouse"]]
        except KeyError:
            self.warning("unknown committee code %s, skipping",
                         record["CommHouse"])
            # BUG FIX: actually skip the record — previously execution
            # fell through with hr_name stale (or unbound on the first
            # occurrence, raising NameError below).
            continue
        description = "Meeting of the {}".format(hr_name)
        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record["Location"] or "Statehouse",
        )
        # One shared agenda item collects every referenced bill.
        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill["bill_id"])
        # Add committee to event
        event.add_committee(hr_name, id=record["CommHouse"], note="host")
        event.add_source("http://www.njleg.state.nj.us/downloads.asp")
        yield event
def scrape_page(self, url, session, chamber):
    """Scrape one IL committee hearing-notice page into an Event.

    Returns the Event (note: returns rather than yields — callers are
    expected to handle the single object).
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()
    tables = doc.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    # key/value metadata rows ("Location:", "Scheduled Date:", ...)
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value
    where = metainf["Location:"]
    subject_matter = metainf["Subject Matter:"]
    description = "{}, {}".format(ctty_name, subject_matter)
    datetime = metainf["Scheduled Date:"]
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {"AM": " AM", "PM": " PM"}  # Space shim.
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = self.localize(
        dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))
    event = Event(description, start_date=datetime, location_name=where)
    event.add_source(url)
    if ctty_name.startswith("Hearing Notice For"):
        # BUG FIX: str.replace returns a new string; the original call
        # discarded the result, so the prefix was never removed. Strip
        # the leftover whitespace as well.
        ctty_name = ctty_name.replace("Hearing Notice For", "").strip()
    event.add_participant(ctty_name, "organization")
    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)
    return event
def scrape_meeting_notice(self, item, url):
    """Convert one Delaware committee-meeting JSON record into an Event.

    item -- JSON dict describing a single committee meeting
    url -- listing URL (not used directly; sources are built from the
           meeting id)
    """
    # Since Event Name is not provided for all meetings, joint committees
    # use their name as-is; others get the committee type prefixed.
    if "Joint" in str(item["CommitteeName"]):
        event_name = str(item["CommitteeName"])
    else:
        event_name = "{} {}".format(str(item["CommitteeTypeName"]),
                                    str(item["CommitteeName"]))
    # Sample value: 04/25/2012 03:00:00 PM
    # NOTE(review): the sample shows a 4-digit year and seconds, but the
    # format below expects a 2-digit year and no seconds — confirm
    # against the live feed.
    fmt = "%m/%d/%y %I:%M %p"
    start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_committee(name=str(item["CommitteeName"]),
                        id=item["CommitteeId"])
    html_url = f'https://legis.delaware.gov/MeetingNotice?committeeMeetingId={item["CommitteeMeetingId"]}'
    event.add_source(html_url)
    page_url = f'https://legis.delaware.gov/json/MeetingNotice/GetCommitteeMeetingItems?committeeMeetingId={item["CommitteeMeetingId"]}'
    page_data = []
    try:
        page_data = self.post(page_url).json()["Data"]
    except json.decoder.JSONDecodeError:
        # No agenda items
        self.info(f"POST returned nothing on {page_url}")
    # NOTE: this loop rebinds `item`, shadowing the parameter above; the
    # parameter is not used afterwards, so this is safe but confusing.
    for item in page_data:
        a = event.add_agenda_item(description=str(item["ItemDescription"]))
        if item["LegislationDisplayText"] is not None:
            a.add_bill(item["LegislationDisplayText"])
        event.add_person(
            name=str(item["PrimarySponsorShortName"]),
            id=str(item["PrimarySponsorPersonId"]),
            note="Sponsor",
        )
    yield event
def scrape_senate(self):
    """Scrape US Senate committee hearings from the official XML feed.

    Yields one Event per <meeting> element that names a committee; the
    agenda text is scanned for a leading bill reference (e.g. "S.785,").
    """
    url = "https://www.senate.gov/general/committee_schedules/hearings.xml"
    page = self.get(url).content
    page = lxml.etree.fromstring(page)
    rows = page.xpath("//meeting")
    for row in rows:
        com = row.xpath("string(committee)")
        # Rows without a committee are placeholders; skip them.
        if com == "":
            continue
        com = "Senate {}".format(com)
        address = row.xpath("string(room)")
        parts = address.split("-")
        building_code = parts[0]
        # Expand known building codes into "<building>, Room <n>".
        if self.buildings.get(building_code):
            address = "{}, Room {}".format(
                self.buildings.get(building_code), parts[1])
        agenda = row.xpath("string(matter)")
        # BUG FIX: the previous format used %H (24-hour) together with
        # %p, which silently mis-parsed afternoon times such as
        # "02:30 PM" as 02:30. %I is the 12-hour directive that %p
        # actually modifies.
        try:
            event_date = datetime.datetime.strptime(
                row.xpath("string(date)"), "%d-%b-%Y %I:%M %p")
        except ValueError:
            # Some rows carry a date with no time component.
            event_date = datetime.datetime.strptime(
                row.xpath("string(date)"), "%d-%b-%Y")
        event_date = self._TZ.localize(event_date)
        event = Event(start_date=event_date, name=com,
                      location_name=address)
        agenda_item = event.add_agenda_item(description=agenda)
        # ex: Business meeting to consider S.785, to improve mental...
        matches = re.findall(r"\s(\w+)\.(\d+),", agenda)
        if matches:
            match = matches[0]
            bill_type = match[0]
            bill_number = match[1]
            bill_name = "{} {}".format(bill_type, bill_number)
            agenda_item.add_bill(bill_name)
        event.add_participant(
            com,
            type="committee",
            note="host",
        )
        event.add_source(
            "https://www.senate.gov/committees/hearings_meetings.htm")
        yield event
def scrape(self, chamber=None, session=None):
    """Scrape CO committee hearing events.

    chamber -- 'upper', 'lower', or None for both. ('other' is handled
    below but only reachable when passed explicitly.)
    session -- session identifier; defaults to the latest session.

    Walks committee listing -> committee page -> individual hearing
    pages, yielding one Event per hearing.
    """
    url = "http://leg.colorado.gov/content/committees"
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    chambers = [chamber] if chamber else ["upper", "lower"]
    for chamber in chambers:
        if chamber == "lower":
            xpath = (
                '//div/h3[text()="House Committees of Reference"]/../'
                'following-sibling::div[contains(@class,"view-content")]/'
                'table//td//span[contains(@class,"field-content")]/a/@href'
            )
        elif chamber == "upper":
            xpath = (
                '//div/h3[text()="Senate Committees of Reference"]/../'
                'following-sibling::div[contains(@class,"view-content")]/'
                'table//td//span[contains(@class,"field-content")]/a/@href'
            )
        elif chamber == "other":
            # All the links under the headers that don't contain "House" or "Senate"
            # NOTE(review): unreachable via the default chambers list;
            # also, any other chamber value would leave `xpath` unbound.
            xpath = (
                '//div/h3[not(contains(text(),"House")) and '
                'not(contains(text(),"Senate"))]/../'
                'following-sibling::div[contains(@class,"view-content")]/'
                'table//td//span[contains(@class,"field-content")]/a/@href'
            )
        page = self.lxmlize(url)
        com_links = page.xpath(xpath)
        for link in com_links:
            page = self.lxmlize(link)
            hearing_links = page.xpath(
                '//div[contains(@class,"schedule-item-content")]'
                "/h4/a/@href")
            for link in hearing_links:
                try:
                    page = self.lxmlize(link)
                    title = page.xpath(
                        '//header/h1[contains(@class,"node-title")]')[0]
                    title = title.text_content().strip()
                    date_day = page.xpath(
                        '//div[contains(@class,"calendar-date")]')[0]
                    date_day = date_day.text_content().strip()
                    details = page.xpath(
                        '//span[contains(@class, "calendar-details")]')[0]
                    details = details.text_content().split("|")
                    date_time = details[0].strip()
                    location = details[1].strip()
                    # "Upon Adjournment" has no clock time; use date only.
                    if "Upon Adjournment" in date_time:
                        date = dt.datetime.strptime(
                            date_day, "%A %B %d, %Y")
                    else:
                        date_str = "{} {}".format(date_day, date_time)
                        date = dt.datetime.strptime(
                            date_str, "%A %B %d, %Y %I:%M %p")
                    agendas = []
                    # they overload the bills table w/ other agenda items.
                    # colspan=2 is agenda
                    non_bills = page.xpath(
                        '//td[@data-label="Hearing Item" and @colspan="2"]'
                    )
                    for row in non_bills:
                        content = row.text_content().strip()
                        agendas.append(content)
                    agenda = "\n".join(agendas) if agendas else ""
                    event = Event(
                        name=title,
                        start_date=self._tz.localize(date),
                        location_name=location,
                    )
                    if agenda.strip():
                        event.add_agenda_item(agenda)
                    event.add_source(link)
                    bills = page.xpath(
                        '//td[@data-label="Hearing Item"]/a')
                    for bill in bills:
                        bill_id = bill.text_content().strip()
                        item = event.add_agenda_item("hearing item")
                        item.add_bill(bill_id)
                    yield event
                except Exception:
                    # TODO: this is awful
                    # NOTE(review): this swallows every error for a
                    # hearing page silently — at minimum it should log.
                    pass
def scrape_event_page(self, url, chamber):
    """Scrape one MI committee-meeting page into an Event.

    url -- meeting detail page
    chamber -- chamber string; 'other' is mapped to 'joint'

    Returns (yields nothing) when the meeting table is empty or the
    meeting is cancelled.
    """
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    trs = page.xpath(
        "//table[@id='frg_mcommitteemeeting_MeetingTable']/tr")
    # key -> {'txt': cell text, 'obj': cell element}
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {"txt": val.text_content().strip(), "obj": val}
    if metainf == {}:
        return
    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf["Date"]["txt"],
        metainf["Time"]["txt"].replace(".", ""),
    )
    if "Cancelled" in datetime:
        return
    # Normalize the site's inconsistent meridiem spellings.
    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
        "a.m": "AM",  # another weird one
    }
    for t in translate:
        if t in datetime:
            datetime = datetime.replace(t, translate[t])
    datetime = re.sub(r"\s+", " ", datetime)
    # Strip free-text qualifiers appended after the time.
    for text_to_remove in [
            "or after committees are given leave",
            "or later immediately after committees are given leave",
            "or later after committees are given leave by the House to meet",
            "**Please note time**",
    ]:
        datetime = datetime.split(text_to_remove)[0].strip()
    datetime = datetime.replace("p.m.", "pm")
    datetime = datetime.replace("Noon", "pm")
    try:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    except ValueError:
        # Some rows omit minutes entirely.
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")
    where = metainf["Location"]["txt"]
    title = metainf["Committee(s)"]["txt"]  # XXX: Find a better title
    if chamber == "other":
        chamber = "joint"
    event = Event(name=title,
                  start_date=self._tz.localize(datetime),
                  location_name=where)
    event.add_source(url)
    event.add_source(mi_events)
    chair_name = metainf["Chair"]["txt"].strip()
    if chair_name:
        event.add_participant(chair_name, type="legislator", note="chair")
    else:
        self.warning("No chair found for event '{}'".format(title))
    event.add_participant(metainf["Committee(s)"]["txt"],
                          type="committee",
                          note="host")
    agenda = metainf["Agenda"]["obj"]
    agendas = agenda.text_content().split("\r")
    related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # NOTE(review): if no agenda line mentions this bill's text,
        # `description` remains bound to the lxml element itself and is
        # passed to add_agenda_item — likely unintended; confirm.
        description = agenda
        for a in agendas:
            if bill.text_content() in a:
                description = a
        item = event.add_agenda_item(description)
        item.add_bill(bill.text_content())
    yield event
def scrape(self):
    """Scrape the North Carolina legislative calendar and yield committee-meeting Events.

    Walks each day block on the calendar page, skipping floor sessions and
    events with no committee link (usually press conferences).
    """
    url = "https://www.ncleg.gov/LegislativeCalendar/"
    page = self.lxmlize(url)
    page.make_links_absolute(url)
    for day_row in page.xpath('//div[@class="row cal-event-day"]'):
        # e.g. the full date heading for this day's group of events
        date = day_row.xpath(
            './/div[contains(@class, "cal-event-day-full")]/text()'
        )[0].strip()
        for row in day_row.xpath(
                './/div[contains(@class, "cal-event row")]'):
            # first cal-event-row sometimes contains full date, skip that
            time = row.xpath(
                'div[contains(@class,"col-12 text-left col-sm-3 text-sm-right")]/text()'
            )[0].strip()
            event_row = row.xpath(
                'div[contains(@class,"col-12 col-sm-9 col-md-12 ")]')[0]
            # skip floor sessions
            if event_row.xpath(
                    './/a[contains(text(), "Session Convenes")]'):
                continue
            # chamber label ("House:"/"Senate:") is a bold span when present
            chamber = ""
            if len(
                    event_row.xpath(
                        'span[contains(@class, "text-dark font-weight-bold")]/text()'
                    )):
                chamber = event_row.xpath(
                    'span[contains(@class, "text-dark font-weight-bold")]/text()'
                )[0].strip()
                chamber = chamber.replace(":", "")
            # sometimes there are unlinked events, usually just press conferences
            if not event_row.xpath('a[contains(@href,"/Committees/")]'):
                continue
            com_link = event_row.xpath(
                'a[contains(@href,"/Committees/")]')[0]
            com_name = com_link.text_content().strip()
            com_name = f"{chamber} {com_name}".strip()
            com_url = com_link.xpath("@href")[0]
            where = (row.xpath('div[contains(@class,"col-12 offset-sm-3")]'
                               )[0].text_content().strip())
            where = where.replace("STREAM", "")
            when = f"{date} {time}"
            try:
                when = dateutil.parser.parse(when)
                # occasionally they'd do 9am-1pm which confuses the TZ detection
                when = self._tz.localize(when)
            except (ParserError, ValueError):
                # fall back to a date-only start when the time is unparseable
                self.warning(
                    f"Unable to parse {time}, only using day component")
                when = dateutil.parser.parse(date)
                when = self._tz.localize(when).date()
            event = Event(
                name=com_name,
                start_date=when,
                location_name=where,
                classification="committee-meeting",
            )
            event.add_source(com_url)
            event.add_participant(com_name, type="committee", note="host")
            # NOTE: if you follow the committee link, there are agenda PDF links
            # but they don't load at all as of 2021-02-01 -- showerst
            for agenda_row in event_row.xpath(".//p"):
                agenda_text = agenda_row.text_content().strip()
                if agenda_text != "":
                    agenda = event.add_agenda_item(agenda_text)
                    # bill links look like "HB 123: Title" -- keep the id part
                    for bill_row in agenda_row.xpath(
                            './/a[contains(@href,"BillLookUp")]/text()'):
                        agenda.add_bill(bill_row.split(":")[0])
            yield event
def scrape_meeting_page(self, url): page = self.lxmlize(url) page.make_links_absolute(url) if page.xpath('//div[text()="Error"]'): return if not page.xpath('//div[@id="wrapleftcol"]/h3'): return com = page.xpath('//div[@id="wrapleftcol"]/h3[1]/text()')[0].strip() when = page.xpath('//div[@id="wrapleftcol"]/h1[1]/text()')[0].strip() if "time to be announced" in when.lower() or "tba" in when.lower(): when = re.sub("time to be announced", "", when, flags=re.IGNORECASE) when = re.sub("TBA", "", when, flags=re.IGNORECASE) when = re.sub(r"or\s+conclusion\s+(.*)", "", when, flags=re.IGNORECASE) when = when.split("-")[0] when = self.clean_date(when) when = dateutil.parser.parse(when) when = self._tz.localize(when) # we check for this elsewhere, but just in case the very first event on a committee page is way in the past if when.year < datetime.datetime.today().year: return where = page.xpath( '//div[@id="wrapleftcol"]/*[contains(text(), "Location")]/text()' )[0].strip() desc = (page.xpath('//div[@id="wrapleftcol"]/blockquote[1]') [0].text_content().strip()) event = Event( name=com, start_date=when, location_name=where, classification="committee-meeting", description=desc, ) for row in page.xpath('//div[@id="wrapleftcol"]/blockquote[1]/p'): if row.text_content().strip() != "": agenda = event.add_agenda_item(row.text_content().strip()) for bill in re.findall(self.bill_regex, row.text_content()): bill_id = re.sub(r"\.\s*", "", bill[0], flags=re.IGNORECASE) bill_id = re.sub(r"house bill", "HB", bill_id, flags=re.IGNORECASE) bill_id = re.sub(r"senate bill", "SB", bill_id, flags=re.IGNORECASE) agenda.add_bill(bill_id) event.add_source(url) yield event
def scrape(self, session=None, start=None, end=None):
    """Scrape Maine legislative calendar events and yield Events.

    Arguments:
    session -- legislature number used for testimony lookups; defaults to
               the latest session.
    start, end -- optional YYYY-MM-DD bounds; defaults to now .. now+30d.
    """
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # testimony url, we'll need it later in a loop
    # testimony query looks gnarly but breaks down to:
    # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
    # $orderby: LastName,FirstName,Organization
    # $expand: Request
    # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
    # PresentedDate,FileSize,Topic
    testimony_url_base = (
        "http://legislature.maine.gov/backend/"
        "breeze/data/CommitteeTestimony?"
        "$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and"
        "%20(Request%2FLegislature%20eq%20{})"
        "&$orderby=LastName%2CFirstName%2COrganization&"
        "$expand=Request&$select=Id%2CFileType%2CNamePrefix"
        "%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic"
    )

    if start is None:
        start_date = datetime.datetime.now().isoformat()
    else:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d").isoformat()

    # default to 30 days if no end
    if end is None:
        end_date = (datetime.datetime.now() +
                    datetime.timedelta(days=30)).isoformat()
    else:
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d").isoformat()

    # Pre-fetch all bills on the calendar, grouped by event id.
    bills_by_event = {}
    bills_url = ("http://legislature.maine.gov/backend/breeze/data/"
                 "getCalendarEventsBills?startDate={}&endDate={}")
    bills_url = bills_url.format(start_date, end_date)
    page = json.loads(self.get(bills_url).content)
    for row in page:
        bills_by_event.setdefault(row["EventId"], []).append(row)

    # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
    url = ("http://legislature.maine.gov/backend/breeze/data/"
           "getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true")
    url = url.format(start_date, end_date)
    page = json.loads(self.get(url).content)
    for row in page:
        if row["Cancelled"] is True or row["Postponed"] is True:
            continue

        # renamed from start_date/end_date to avoid shadowing the query bounds
        event_start = self._TZ.localize(
            dateutil.parser.parse(row["FromDateTime"]))
        event_end = self._TZ.localize(
            dateutil.parser.parse(row["ToDateTime"]))

        name = row["CommitteeName"]
        if name is None:
            name = row["Host"]

        # expand shorthand venue names to full street addresses
        address = row["Location"]
        address = address.replace(
            "Cross Building",
            "Cross Office Building, 111 Sewall St, Augusta, ME 04330",
        )
        address = address.replace(
            "State House",
            "Maine State House, 210 State St, Augusta, ME 04330")

        event = Event(
            start_date=event_start,
            end_date=event_end,
            name=name,
            location_name=address,
        )
        event.add_source(
            "http://legislature.maine.gov/committee/#Committees/{}".format(
                row["CommitteeCode"]))

        if bills_by_event.get(row["Id"]):
            for bill in bills_by_event[row["Id"]]:
                description = "LD {}: {}".format(bill["LD"], bill["Title"])
                agenda = event.add_agenda_item(description=description)
                agenda.add_bill("LD {}".format(bill["LD"]))

                if bill["TestimonyCount"] > 0:
                    test_url = testimony_url_base.format(
                        bill["PaperNumber"], session)
                    test_page = json.loads(self.get(test_url).content)
                    for test in test_page:
                        title = "{} {} - {}".format(
                            test["FirstName"],
                            test["LastName"],
                            test["Organization"],
                        )
                        if test["NamePrefix"] is not None:
                            title = "{} {}".format(test["NamePrefix"], title)
                        # renamed from test_url to avoid clobbering the query URL
                        doc_url = (
                            "http://legislature.maine.gov/backend/app/services"
                            "/getDocument.aspx?doctype=test&documentId={}".
                            format(test["Id"]))
                        # BUG FIX: media_type was only assigned for PDFs, so a
                        # non-PDF first file raised NameError and later non-PDFs
                        # inherited a stale "application/pdf".
                        if test["FileType"] == "pdf":
                            media_type = "application/pdf"
                        else:
                            media_type = "application/octet-stream"
                        event.add_document(note=title,
                                           url=doc_url,
                                           media_type=media_type)
        yield event
def scrape_lower(self):
    """Scrape Minnesota House committee hearings and yield Events.

    Skips floor sessions, unlinked events, and joint hearings (joint
    hearings come from the Senate API instead).
    """
    url = "https://www.house.leg.state.mn.us/Schedules/All"
    page = self.lxmlize(url)

    for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
        # skip floor sessions and unlinked events
        if not row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
        ):
            continue

        # skip joint ones, we'll get those from the senate API
        if row.xpath('div[contains(@class,"card-header bg-joint")]'):
            continue

        # top-level committee
        com = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        )[0].strip()
        com_link = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
        )[0]

        when = (row.xpath(
            'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
        )[0].replace("\r\n", "").strip())
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)

        if row.xpath('.//b[.="Location:"]'):
            where = row.xpath(
                './/b[.="Location:"]/following-sibling::text()[1]'
            )[0].strip()
        else:
            where = "See committee page"

        if row.xpath('.//b[.="Agenda:"]'):
            desc = "\n".join(
                row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()'
                          )).strip()
        else:
            desc = "See committee page"

        event = Event(
            name=com,
            start_date=when,
            location_name=where,
            classification="committee-meeting",
            description=desc,
        )
        event.add_source(com_link)

        # BUG FIX: previously passed `desc` (the whole agenda text) to
        # add_bill instead of the extracted bill id.
        for bill in get_bill_ids(desc):
            event.add_bill(bill)

        bill_ids = row.xpath(
            ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
        )
        if bill_ids:
            agenda = event.add_agenda_item("Bills")
            for bill_id in bill_ids:
                agenda.add_bill(bill_id.strip())

        for attachment in row.xpath(".//ul/li/div/a"):
            doc_url = attachment.xpath("@href")[0]
            doc_name = attachment.xpath("text()")[0].strip()

            # sometimes broken links to .msg files (emails?) are attached,
            # they always 404.
            if doc_url.endswith(".msg"):
                continue

            # if they don't provide a name just use the filename
            if doc_name == "":
                # BUG FIX: basename must be taken from the parsed URL's
                # .path, not from the ParseResult object itself.
                parsed_url = urlparse(doc_url)
                doc_name = os.path.basename(parsed_url.path)

            media_type = get_media_type(doc_url)
            event.add_document(doc_name,
                               doc_url,
                               media_type=media_type,
                               on_duplicate="ignore")

        for committee in row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        ):
            event.add_participant(committee, type="committee", note="host")

        yield event
def scrape_committee_page(self, url): page = self.get(url, headers=self.cf_headers).content page = lxml.html.fromstring(page) page.make_links_absolute(url) com = page.xpath( '//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip() for row in page.xpath('//div[contains(@id, "agenda-item")]'): # status = "tentative" meta = row.xpath( 'div[contains(@class,"accordion-heading-agenda")]/a')[0] date = meta.xpath("text()")[0].strip() time_and_loc = meta.xpath("span/text()")[0].strip() time_and_loc = time_and_loc.split("\n") time = time_and_loc[0] loc = time_and_loc[1] if loc == "": loc = "See Agenda" com = com.replace("(S)", "Senate").replace("(H)", "House") # Indiana has a LOT of undefined times, stuff like "15 mins after adj. of elections" # so just remove the time component if it won't parse, and the user can go to the agenda try: when = dateutil.parser.parse(f"{date} {time}") except dateutil.parser._parser.ParserError: when = dateutil.parser.parse(date) when = self._tz.localize(when) if "cancelled" in time.lower(): continue event = Event( name=com, start_date=when, location_name=loc, classification="committee-meeting", ) event.add_source(url) event.add_participant(com, type="committee", note="host") if row.xpath('.//a[contains(text(), "View Agenda")]'): agenda_url = row.xpath( './/a[contains(text(), "View Agenda")]/@href')[0] event.add_document("Agenda", agenda_url, media_type="application/pdf") if row.xpath('.//a[contains(text(), "Watch")]'): vid_url = row.xpath('.//a[contains(text(), "Watch")]/@href')[0] event.add_media_link("Video of Hearing", vid_url, media_type="text/html") if row.xpath('.//tr[contains(@class,"bill-container")]/td'): agenda = event.add_agenda_item("Bills under consideration") for bill_row in row.xpath( './/tr[contains(@class,"bill-container")]'): bill_id = bill_row.xpath( ".//a[contains(@class,'bill-name-link')]/text()")[0] agenda.add_bill(bill_id) yield event
def house_meeting(self, xml, source_url):
    """Build and yield an Event from a US House committee meeting XML document.

    Arguments:
    xml -- parsed meeting XML (lxml element tree)
    source_url -- URL the XML was fetched from, recorded as the source
    """
    title = xml.xpath("string(//meeting-details/meeting-title)")
    meeting_date = xml.xpath("string(//meeting-date/calendar-date)")
    start_time = xml.xpath("string(//meeting-date/start-time)")
    end_time = xml.xpath("string(//meeting-date/end-time)")
    start_dt = datetime.datetime.strptime(
        "{} {}".format(meeting_date, start_time), "%Y-%m-%d %H:%M:%S")
    start_dt = self._TZ.localize(start_dt)
    # end time is optional in the feed
    end_dt = None
    if end_time != "":
        end_dt = datetime.datetime.strptime(
            "{} {}".format(meeting_date, end_time), "%Y-%m-%d %H:%M:%S")
        end_dt = self._TZ.localize(end_dt)
    building = xml.xpath(
        "string(//meeting-details/meeting-location/capitol-complex/building)"
    )
    # "Select one" is the feed's placeholder for an unspecified building
    address = "US Capitol"
    if building != "Select one":
        # self.buildings maps feed building codes to display names
        if self.buildings.get(building):
            building = self.buildings.get(building)
        room = xml.xpath(
            "string(//meeting-details/meeting-location/capitol-complex/room)"
        )
        address = "{}, Room {}".format(building, room)
    event = Event(start_date=start_dt, name=title, location_name=address)
    event.add_source(source_url)
    coms = xml.xpath(
        "//committees/committee-name | //subcommittees/committee-name")
    for com in coms:
        com_name = com.xpath("string(.)")
        com_name = "House {}".format(com_name)
        event.add_participant(
            com_name,
            type="committee",
            note="host",
        )
    docs = xml.xpath("//meeting-documents/meeting-document")
    for doc in docs:
        doc_name = doc.xpath("string(description)")
        doc_files = doc.xpath("files/file")
        for doc_file in doc_files:
            media_type = self.media_types[doc_file.get("doc-type")]
            url = doc_file.get("doc-url")
            # bill-like documents (bills, amendments, committee amendments)
            # also become agenda items with attached bill ids
            if doc.get("type") in ["BR", "AM", "CA"]:
                if doc_name == "":
                    doc_name = doc.xpath("string(legis-num)").strip()
                # e.g. "H.R. 1234" -> ("H.R.", "1234")
                matches = re.findall(r"([\w|\.]+)\s+(\d+)", doc_name)
                if matches:
                    match = matches[0]
                    bill_type = match[0].replace(".", "")
                    bill_number = match[1]
                    bill_name = "{} {}".format(bill_type, bill_number)
                    agenda = event.add_agenda_item(description=bill_name)
                    agenda.add_bill(bill_name)
            # fall back to a generic name derived from the document type
            if doc_name == "":
                try:
                    doc_name = self.hearing_document_types[doc.get("type")]
                except KeyError:
                    self.warning("Unable to find document type: {}".format(
                        doc.get("type")))
            event.add_document(doc_name,
                               url,
                               media_type=media_type,
                               on_duplicate="ignore")
    yield event
def scrape(self):
    """Scrape the Hawaii hearings listing and yield committee-meeting Events.

    Raises EmptyScrape when the site reports no hearings. Duplicate
    listing rows for the same hearing notice are collapsed via
    self.seen_hearings.
    """
    get_short_codes(self)
    page = self.lxmlize(URL)

    if page.xpath("//td[contains(string(.),'No Hearings')]"):
        raise EmptyScrape

    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    # NOTE: renamed loop var from `event` -- the original shadowed the <tr>
    # element with the Event object mid-loop.
    for row in table.xpath(".//tr")[1:]:
        tds = row.xpath("./td")
        committee = tds[0].text_content().strip()

        # Multi-committee events will be CODE1/CODE2/CODE3
        if "/" in committee:
            coms = committee.split("/")
            com_names = []
            for com in coms:
                com_names.append("{} {}".format(
                    self.chambers[self.short_ids[com]["chamber"]],
                    self.short_ids[com]["name"],
                ))
            descr = ", ".join(com_names)
        elif self.short_ids.get(committee):
            descr = "{} {}".format(
                self.chambers[self.short_ids[committee]["chamber"]],
                self.short_ids[committee]["name"],
            )
        else:
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                # BUG FIX: was a bare `raise Exception` with no context
                raise Exception(
                    "Unexpected event description cells for committee "
                    "'{}': {}".format(committee, descr))
            descr = descr[0].replace(".", "").strip()

        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib["href"]
        notice_name = notice.text

        # the listing page shows the same hearing in multiple rows.
        # combine these -- get_related_bills() will take care of adding
        # the bills and descriptions
        if notice_href in self.seen_hearings:
            continue
        self.seen_hearings.append(notice_href)

        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = TIMEZONE.localize(when)
        event = Event(
            name=descr,
            start_date=when,
            classification="committee-meeting",
            description=descr,
            location_name=where,
        )

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]
        for committee in committees:
            # expand known short codes to "<chamber> <name>"; INFO and
            # unknown codes are added verbatim
            if "INFO" not in committee and committee in self.short_ids:
                committee = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            event.add_committee(committee, note="host")

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type="text/html")
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill["descr"].strip())
            # bill ids can be comma-joined; keep the first
            bill["bill_id"] = bill["bill_id"].split(",")[0]
            a.add_bill(bill["bill_id"], note=bill["type"])
        yield event
def scrape_chamber(self, chamber):
    """Scrape the Idaho agenda calendar for one chamber and yield Events.

    Arguments:
    chamber -- 'upper' (Senate) or 'lower' (House); anything else is a no-op
    """
    if chamber == "upper":
        url = "https://legislature.idaho.gov/sessioninfo/agenda/sagenda/"
    elif chamber == "lower":
        url = "https://legislature.idaho.gov/sessioninfo/agenda/hagenda/"
    else:
        # ROBUSTNESS: previously `url` was left unbound -> NameError
        return

    page = self.get(url).content
    page = lxml.html.fromstring(page)

    for row in page.xpath('//div[@id="ai1ec-container"]/div'):
        month = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/text()"
        )[0].strip()
        day = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/span/text()"
        )[0].strip()

        time_and_loc = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'abbr')]/h2/text()"
        )
        time = time_and_loc[0].strip()
        loc = time_and_loc[1].strip()

        if "not meet" in time.lower():
            continue

        # times are often free text; fall back to date-only parsing
        try:
            start = dateutil.parser.parse(f"{month} {day} {time}")
        except dateutil.parser._parser.ParserError:
            start = dateutil.parser.parse(f"{month} {day}")
        start = self._tz.localize(start)

        com = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'day')]/h2/a/text()"
        )[0].strip()

        event = Event(
            name=com,
            start_date=start,
            location_name=loc,
            classification="committee-meeting",
        )
        event.add_participant(com, type="committee", note="host")

        agenda_url = row.xpath(
            './/a[contains(text(), "Full Agenda")]/@href')[0]
        event.add_document("Agenda", agenda_url, media_type="application/pdf")

        agenda_rows = row.xpath(
            './/div[contains(@class,"card")]/div[contains(@id, "Agenda")]/div/table/tbody/tr'
        )[1:]
        for agenda_row in agenda_rows:
            subject = agenda_row.xpath("string(td[1])").strip()
            description = agenda_row.xpath("string(td[2])").strip()
            presenter = agenda_row.xpath("string(td[3])").strip()
            if presenter != "":
                agenda_text = (
                    f"{subject} {description} Presenter: {presenter}".strip())
                # BUG FIX: previously the entire agenda_text was registered as
                # the participant name; the participant is the presenter.
                event.add_participant(presenter,
                                      type="person",
                                      note="Presenter")
            else:
                agenda_text = f"{subject} {description}".strip()

            agenda = event.add_agenda_item(agenda_text)

            if agenda_row.xpath('td[1]/a[contains(@href,"/legislation/")]'):
                agenda.add_bill(
                    agenda_row.xpath(
                        'td[1]/a[contains(@href,"/legislation/")]/text()')
                    [0].strip())

        event.add_source(url)
        yield event
def scrape_upper(self):
    """Scrape the Minnesota Senate schedule API and yield Events."""
    url = "https://www.senate.mn/api/schedule/upcoming"
    data = self.get(url).json()

    for row in data["events"]:
        com = row["committee"]["committee_name"]
        start = dateutil.parser.parse(row["hearing_start"])
        start = self._tz.localize(start)

        # prefer "building room", then building alone, then TBD
        if (row["hearing_room"] and "hearing_building" in row
                and row["hearing_building"]):
            where = f"{row['hearing_building']} {row['hearing_room']}"
        elif "hearing_building" in row and row["hearing_building"]:
            where = row["hearing_building"]
        else:
            where = "TBD"

        description = ""
        if "hearing_notes" in row and row["hearing_notes"]:
            description = row["hearing_notes"]

        event = Event(
            name=com,
            location_name=where,
            start_date=start,
            classification="committee-meeting",
            description=description,
        )

        # BUG FIX: previously passed `description` (the whole notes text)
        # to add_bill instead of the extracted bill id.
        for bill in get_bill_ids(description):
            event.add_bill(bill)

        # pick the best available source link for the event
        if "lrl_schedule_link" in row:
            event.add_source(row["lrl_schedule_link"])
        else:
            if "link" in row["committee"]:
                if row["committee"]["link"].startswith("http"):
                    event.add_source(row["committee"]["link"])
                elif row["committee"]["link"].startswith("www"):
                    event.add_source(f"http://{row['committee']['link']}")
                else:
                    event.add_source(
                        f"https://www.senate.mn/{row['committee']['link']}")
            elif "senate_chair_link" in row["committee"]:
                event.add_source(
                    f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                )

        if "agenda" in row:
            for agenda_row in row["agenda"]:
                if (agenda_row["description"] is None
                        or agenda_row["description"].strip() == ""):
                    # sometimes they have blank agendas but bills or files
                    agenda_row["description"] = "Agenda"
                agenda = event.add_agenda_item(agenda_row["description"])

                if "bill_type" in agenda_row:
                    agenda.add_bill("{} {}".format(
                        agenda_row["bill_type"].replace(".", ""),
                        agenda_row["bill_number"],
                    ))

                if "files" in agenda_row:
                    for file_row in agenda_row["files"]:
                        doc_name = file_row["filename"]
                        doc_url = file_row["file_path"]
                        # if they don't provide a name just use the filename
                        if doc_name == "":
                            parsed_url = urlparse(doc_url)
                            doc_name = os.path.basename(parsed_url.path)
                        event.add_document(
                            doc_name,
                            f"https://www.senate.mn/{doc_url}",
                            media_type="text/html",
                            on_duplicate="ignore",
                        )

        if "video_link" in row:
            event.add_media_link("Video", row["video_link"], "text/html")
        if "audio_link" in row:
            event.add_media_link("Audio", row["audio_link"], "text/html")

        yield event