def scrape_meeting_notice(self, chamber, item, url):
    """Yield an Event built from one Delaware committee-meeting JSON item.

    :param chamber: unused here; kept for caller compatibility.
    :param item: dict decoded from the MeetingNotice JSON feed.
    :param url: source URL the item came from (attached to the event).
    """
    # Since Event Name is not provided for all mettings.
    event_name = str(item["CommitteeName"])
    # Sample value documented by the feed: 04/25/2012 03:00:00 PM
    # BUG FIX: the previous format "%m/%d/%y %I:%M %p" used a two-digit
    # year and omitted seconds, so strptime raised ValueError on values
    # shaped like the sample above. Use %Y and %I:%M:%S to match it.
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_source(url)
    event.add_committee(name=str(item["CommitteeName"]), id=item["CommitteeId"])
    # Fetch the per-meeting agenda items from a secondary JSON endpoint.
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item["CommitteeMeetingId"]))
    event.add_source(page_url)
    page_data = self.post(page_url).json()["Data"]
    for item in page_data:
        event.add_agenda_item(description=str(item["ItemDescription"]))
        event.add_person(
            name=str(item["PrimarySponsorShortName"]),
            id=str(item["PrimarySponsorPersonId"]),
            note="Sponsor",
        )
    yield event
def scrape_events(self, page):
    """Yield committee-meeting Events parsed from the Nebraska calendar HTML."""
    doc = lxml.html.fromstring(page)
    empty_banner = doc.xpath(
        "//h3[contains(text(),'There are no hearings for the date range')]")
    if empty_banner:
        raise EmptyScrape
    for card in doc.xpath('//div[@class="card mb-4"]'):
        header_texts = card.xpath('div[contains(@class, "card-header")]/text()')
        committee = header_texts[0].strip()
        details = card.xpath(
            'div[contains(@class, "card-header")]/small/text()')[0].strip()
        location, time = details.split(" - ")
        # turn room numbers into the full address
        if location.lower().startswith("room"):
            location = "1445 K St, Lincoln, NE 68508, {}".format(location)
        day = card.xpath(
            "./preceding-sibling::h2[@class='text-center']/text()")[-1].strip()
        # Thursday February 27, 2020 1:30 PM
        stamp = "{} {}".format(day, time)
        start = self._tz.localize(
            datetime.datetime.strptime(stamp, "%A %B %d, %Y %I:%M %p"))
        event = Event(
            name=committee,
            start_date=start,
            classification="committee-meeting",
            description="Committee Meeting",
            location_name=location,
        )
        event.add_committee(committee, note="host")
        for row in card.xpath("div/table/tr"):
            # rows without a third cell carry no agenda text
            if not row.xpath("td[3]"):
                continue
            description = row.xpath("td[3]/text()")[0].strip()
            agenda_item = event.add_agenda_item(description=description)
            bill_links = row.xpath("td[1]/a")
            if bill_links:
                # bill link
                agenda_item.add_bill(bill_links[0].xpath("text()")[0].strip())
        event.add_source(
            "https://nebraskalegislature.gov/calendar/calendar.php")
        yield event
def scrape(self):
    """Yield Events from the committee agenda calendar page at ``calurl``."""
    page = self.lxmlize(calurl)
    rows = page.xpath("//table[@class='agenda-body']//tr")[1:]
    for row in rows:
        committee_links = row.xpath(".//a[contains(@title,'Committee Details')]")
        # rows without exactly one committee link are not meeting rows
        if len(committee_links) != 1:
            continue
        committee_link = committee_links[0]
        participants = self.scrape_participants(committee_link.attrib["href"])
        cells = row.xpath("./*")
        date = cells[0].text_content().strip()
        chamber_and_cttie = cells[1].text_content().strip()
        chamber, cttie = [part.strip()
                          for part in chamber_and_cttie.split(" - ", 1)]
        info = cells[2]
        notice_link = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = notice_link.attrib["href"]
        name = notice_link.text
        time, where = info.xpath("./i/text()")
        what = cells[3].text_content().replace("Items: ", "")
        if "(None)" in what:
            continue
        agenda_entries = [entry.strip() for entry in what.split(";")]
        # the page omits the year, so assume the current one
        when_text = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when_text, "%a %b %d, %Y, %I:%M %p")
        if cttie:
            cttie = cttie.replace("Committee on", "").strip()
            cttie = f"{chamber} {cttie}"
            name = cttie
        meeting = Event(
            name=name, location_name=where, start_date=self._tz.localize(when)
        )
        meeting.add_source(calurl)
        meeting.add_committee(cttie, note="host")
        meeting.add_document("notice", notice, media_type="application/pdf")
        for entry in agenda_entries:
            agenda_item = meeting.add_agenda_item(entry)
            if entry.startswith("AB") or entry.startswith("SB"):
                agenda_item.add_bill(entry)
        for participant in participants:
            meeting.add_person(participant["name"])
        yield meeting
def scrape(self, session=None):
    """Yield scheduled NJ committee-meeting Events from the Agendas table.

    :param session: legislative session number; defaults to the latest one.
    """
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    # Session 209 began in 2000; each session spans two years.
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    # Keep record of all events
    records = self.access_to_csv("Agendas")
    for record in records:
        if record["Status"] != "Scheduled":
            continue
        description = record["Comments"]
        # Bill ids such as "A1234" or "S-1234" embedded in the comments.
        related_bills = []
        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })
        date_time = "%s %s" % (record["Date"], record["Time"])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
        try:
            hr_name = self._committees[record["CommHouse"]]
        except KeyError:
            self.warning("unknown committee code %s, skipping",
                         record["CommHouse"])
            # BUG FIX: previously fell through without `continue`, so the
            # event was built with an unbound (NameError) or stale hr_name
            # despite the "skipping" warning.
            continue
        description = "Meeting of the {}".format(hr_name)
        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record["Location"] or "Statehouse",
        )
        # All related bills share one agenda item (the meeting description).
        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill["bill_id"])
        # Add committee to event
        event.add_committee(hr_name, id=record["CommHouse"], note="host")
        event.add_source("http://www.njleg.state.nj.us/downloads.asp")
        yield event
def scrape_meeting_notice(self, item, url):
    """Yield an Event built from one Delaware committee-meeting JSON item.

    :param item: dict decoded from the MeetingNotice JSON feed.
    :param url: source URL the item came from (unused; kept for callers).
    """
    # Since Event Name is not provided for all mettings.
    if "Joint" in str(item["CommitteeName"]):
        event_name = str(item["CommitteeName"])
    else:
        event_name = "{} {}".format(str(item["CommitteeTypeName"]),
                                    str(item["CommitteeName"]))
    # Sample value documented by the feed: 04/25/2012 03:00:00 PM
    # BUG FIX: the previous format "%m/%d/%y %I:%M %p" used a two-digit
    # year and omitted seconds, so strptime raised ValueError on values
    # shaped like the sample above. Use %Y and %I:%M:%S to match it.
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_committee(name=str(item["CommitteeName"]), id=item["CommitteeId"])
    html_url = f'https://legis.delaware.gov/MeetingNotice?committeeMeetingId={item["CommitteeMeetingId"]}'
    event.add_source(html_url)
    page_url = f'https://legis.delaware.gov/json/MeetingNotice/GetCommitteeMeetingItems?committeeMeetingId={item["CommitteeMeetingId"]}'
    page_data = []
    try:
        page_data = self.post(page_url).json()["Data"]
    except json.decoder.JSONDecodeError:
        # No agenda items
        self.info(f"POST returned nothing on {page_url}")
    for item in page_data:
        a = event.add_agenda_item(description=str(item["ItemDescription"]))
        if item["LegislationDisplayText"] is not None:
            a.add_bill(item["LegislationDisplayText"])
        event.add_person(
            name=str(item["PrimarySponsorShortName"]),
            id=str(item["PrimarySponsorPersonId"]),
            note="Sponsor",
        )
    yield event
def scrape(self, session=None):
    """Yield Vermont committee-meeting Events from the loadAllMeetings feed.

    :param session: session identifier; defaults to the latest session.
    """
    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)
    url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
        year_slug)
    meetings = json.loads(self.get(url).text)["data"]
    for info in meetings:
        # Determine when the committee meets
        slot = info["TimeSlot"]
        if slot == "" or slot == "1" or slot == 1:
            # No concrete time slot -> treat as an all-day event.
            start_time = datetime.datetime.strptime(
                info["MeetingDate"], "%A, %B %d, %Y")
            all_day = True
        else:
            all_day = False
            try:
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"] + ", " + info["TimeSlot"],
                    "%A, %B %d, %Y, %I:%M %p",
                )
            except ValueError:
                # TimeSlot was unparseable; fall back to StartTime.
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"] + ", " + info["StartTime"],
                    "%A, %B %d, %Y, %I:%M %p",
                )
        meeting = Event(
            start_date=self.TIMEZONE.localize(start_time),
            all_day=all_day,
            name="Meeting of the {}".format(info["LongName"]),
            description="committee meeting",
            location_name="{0}, Room {1}".format(info["BuildingName"],
                                                 info["RoomNbr"]),
        )
        meeting.add_source(url)
        meeting.add_committee(name=info["LongName"], note="host")
        yield meeting
def scrape(self, session=None):
    """Yield Vermont committee-meeting Events from the loadAllMeetings feed.

    :param session: session identifier passed through to the year-slug lookup.
    """
    year_slug = self.jurisdiction.get_year_slug(session)
    url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
        year_slug)
    meetings = json.loads(self.get(url).text)["data"]
    for info in meetings:
        # Determine when the committee meets
        slot = info["TimeSlot"]
        if slot == "" or slot == "1" or slot == 1:
            # No concrete time slot -> treat as an all-day event.
            start_time = dateutil.parser.parse(info["MeetingDate"])
            all_day = True
        else:
            all_day = False
            try:
                start_time = dateutil.parser.parse(
                    f"{info['MeetingDate']}, {info['TimeSlot']}")
            except ParserError:
                # Unparseable time slot: fall back to the date alone.
                start_time = dateutil.parser.parse(info["MeetingDate"])
        meeting = Event(
            start_date=self.TIMEZONE.localize(start_time),
            all_day=all_day,
            name="Meeting of the {}".format(info["LongName"]),
            description="committee meeting",
            location_name="{0}, Room {1}".format(info["BuildingName"],
                                                 info["RoomNbr"]),
        )
        meeting.add_source(url)
        meeting.add_committee(name=info["LongName"], note="host")
        yield meeting
def scrape(self):
    """Yield Hawaii hearing Events from the legislature's hearings table.

    :raises EmptyScrape: when the page reports no hearings.
    :raises Exception: when a description cell has an unexpected shape.
    """
    get_short_codes(self)
    page = self.lxmlize(URL)
    if page.xpath("//td[contains(string(.),'No Hearings')]"):
        raise EmptyScrape
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        # Multi-committee events will be CODE1/CODE2/CODE3
        if "/" in committee:
            coms = committee.split("/")
            com_names = []
            for com in coms:
                com_names.append("{} {}".format(
                    self.chambers[self.short_ids[com]["chamber"]],
                    self.short_ids[com]["name"],
                ))
            descr = ", ".join(com_names)
        elif self.short_ids.get(committee):
            descr = "{} {}".format(
                self.chambers[self.short_ids[committee]["chamber"]],
                self.short_ids[committee]["name"],
            )
        else:
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                # BUG FIX: previously a bare `raise Exception` with no
                # message, which made failures undiagnosable from logs.
                raise Exception(
                    "expected exactly one description span, got {}".format(
                        len(descr)))
            descr = descr[0].replace(".", "").strip()
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib["href"]
        notice_name = notice.text
        # the listing page shows the same hearing in multiple rows.
        # combine these -- get_related_bills() will take care of adding the bills
        # and descriptions
        if notice_href in self.seen_hearings:
            continue
        else:
            self.seen_hearings.append(notice_href)
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = TIMEZONE.localize(when)
        event = Event(
            name=descr,
            start_date=when,
            classification="committee-meeting",
            description=descr,
            location_name=where,
        )
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]
        for committee in committees:
            # Informational briefings (INFO) are not committees.
            if "INFO" not in committee and committee in self.short_ids:
                committee = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            event.add_committee(committee, note="host")
        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type="text/html")
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill["descr"].strip())
            bill["bill_id"] = bill["bill_id"].split(",")[0]
            a.add_bill(bill["bill_id"], note=bill["type"])
        yield event
def scrape_chamber(self, chamber=None):
    """Yield Events from the weekly calendar, optionally filtered by chamber.

    :param chamber: chamber key; ``None`` means include events of any chamber.
    """
    # If chamber is None, don't exclude any events from the results based on chamber
    chmbr = cal_chamber_text.get(chamber)
    columns = ["time", "chamber", "type", "agenda", "location", "video"]
    dtfmt = "%A, %B %d, %Y %I:%M %p"
    dtfmt_no_time = "%A, %B %d, %Y"
    for table in url_xpath(cal_weekly_events, "//table[@class='date-table']"):
        date = table.xpath("../.")[0].getprevious().text_content()
        for tr in table.xpath("./tr"):
            tds = tr.xpath("./td")
            if not tds:
                continue
            metainf = {}
            for idx, key in enumerate(columns):
                metainf[key] = tds[idx]
            if chmbr and metainf["chamber"].text_content() != chmbr:
                self.info("Skipping event based on chamber.")
                continue
            time = metainf["time"].text_content()
            datetime_string = "%s %s" % (date.strip(" \r\n"),
                                         time.strip(" \r\n"))
            location = metainf["location"].text_content()
            description = metainf["type"].text_content()
            if time == "Cancelled":
                self.log("Skipping cancelled event.")
                continue
            # Skip events without a parseable concrete start time.
            if "Immediately follows H-FLOOR" in datetime_string:
                continue
            if " Immediately follows" in datetime_string:
                datetime_string, _ = datetime_string.split(
                    "Immediately follows")
            if "canceled" in datetime_string.lower():
                continue
            if "TBA" in datetime_string:
                continue
            datetime_string = datetime_string.strip()
            try:
                when = dt.datetime.strptime(datetime_string, dtfmt)
            except ValueError:
                # Some rows carry only a date, no time component.
                when = dt.datetime.strptime(datetime_string, dtfmt_no_time)
            when = self._utc.localize(when)
            event = Event(
                name=description,
                start_date=when,
                location_name=location,
                description=description,
            )
            # The description is a committee name
            event.add_committee(name=description)
            event.add_source(cal_weekly_events)
            for doc in metainf["agenda"].xpath(".//a"):
                if not doc.text_content():
                    continue
                self.add_agenda(doc.attrib["href"], doc.text_content(), event)
            yield event