def scrape_committee_events(self, code, name):
    """Yield Events from a CT committee's JSON calendar feed.

    Skips untitled entries and entries whose title marks them cancelled.
    """
    events_url = (
        "http://www.cga.ct.gov/basin/fullcalendar/commevents.php?"
        "comm_code={}".format(code))
    events_data = self.get(events_url, verify=False).text
    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
    for info in json.loads(events_data):
        title = info["title"]
        if title is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        if title.startswith("CANCELLED:"):
            self.info(
                "Cancelled event found; it will be skipped: {}".format(title))
            continue
        when = datetime.datetime.strptime(info["start"], DATETIME_FORMAT)
        where = "{0} {1}".format(info["building"].strip(),
                                 info["location"].strip())
        event = Event(
            start_date=self._tz.localize(when),
            location_name=where,
            name=title,
            description=title,
        )
        event.add_source(events_url)
        yield event
def scrape_upper(self):
    """Scrape Oklahoma Senate meeting notices and yield Events.

    The notice page is parsed as plain text: chunks are delimited by
    date headings ("Monday, January 1, 2020"); each chunk carries
    TIME:/PLACE: lines.
    """
    url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    text = page.text_content()
    _, text = text.split("MEETING NOTICES")
    re_date = r"[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}"
    chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])
    for match, data in chunks:
        when = match.group()
        when = datetime.datetime.strptime(when, "%A, %B %d, %Y")
        # FIX: filter() returns a lazy iterator on Python 3, so the
        # original `lines[0]` below raised TypeError; build a real list.
        lines = [x.strip() for x in data.splitlines() if x.strip()]
        # "TIME: 10:30 a.m. \x96" — \x96 is the cp1252 en dash terminator
        time_ = re.search(r"^\s*TIME:\s+(.+?)\s+\x96", data, re.M).group(1)
        time_ = time_.replace("a.m.", "AM").replace("p.m.", "PM")
        time_ = time.strptime(time_, "%I:%M %p")
        when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min)
        # first non-empty line of the chunk is the event title
        title = lines[0]
        where = re.search(r"^\s*PLACE:\s+(.+)", data, re.M).group(1)
        where = where.strip()
        event = Event(name=title,
                      start_date=self._tz.localize(when),
                      location_name=where)
        event.add_source(url)
        yield event
def event_obj():
    """Return a minimal sample Event (now in UTC, second precision)."""
    timestamp = datetime.datetime.utcnow().isoformat().split(".")[0] + "Z"
    e = Event(
        name="get-together",
        start_date=timestamp,
        location_name="Joe's Place",
    )
    e.add_source(url="http://example.com/foobar")
    return e
def scrape_chamber(self, chamber):
    """Scrape PA committee-meeting tables for `chamber`, yielding Events.

    Bill and committee identifiers are decoded from the query strings of
    the links inside each agenda cell.
    """
    url = utils.urls["events"][chamber]
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for table in page.xpath(
            '//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # the meeting date lives on an anchor in the enclosing detail div
        date_string = table.xpath(
            'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
        for row in table.xpath("tr"):
            time_string = row.xpath(
                'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
            description = (
                row.xpath('td[@class="CMS-MeetingDetail-Agenda"]/div/div')
                [-1].text_content().strip())
            location = (row.xpath('td[@class="CMS-MeetingDetail-Location"]'
                                  )[0].text_content().strip())
            committees = row.xpath(
                './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a')
            bills = row.xpath('.//a[contains(@href, "billinfo")]')
            try:
                start_date = datetime.datetime.strptime(
                    "{} {}".format(date_string, time_string),
                    "%m/%d/%Y %I:%M %p")
            except ValueError:
                # unparseable time (e.g. "Off the Floor"): skip this table
                break
            event = Event(
                name=description,
                start_date=self._tz.localize(start_date),
                location_name=location,
            )
            event.add_source(url)
            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # FIX: parse_qs maps each key to a LIST of values; the
                    # original formatted the lists themselves, producing
                    # ids like "['H']['B'] ['123']". Use the first value.
                    item.add_bill("{}{} {}".format(
                        qs["body"][0], qs["type"][0], qs["bn"][0]))
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # FIX: likewise unwrap the single-element list from
                    # parse_qs (None when no Code parameter is present).
                    code = qs.get("Code")
                    item.add_committee(
                        re.sub(r" \([S|H]\)$", "", committee.text),
                        id=code[0] if code else None,
                    )
            yield event
def scrape_lower_event(self, url):
    """Scrape one FL House meeting page at `url` into a single Event.

    Pages marked "not meeting" are skipped. Yields at most one Event.
    """
    html = self.get(url).text
    if "not meeting" in html.lower():
        self.info(f"Skipping {url}, not meeting")
        return
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)
    # committee name from the section header, prefixed with the chamber
    com = (page.xpath('//div[contains(@class,"sectionhead")]/h1')
           [0].text_content().strip())
    com = f"House {com}"
    start = self.get_meeting_row(page, "Start Date")
    start = self.tz.localize(dateutil.parser.parse(start))
    end = None
    # the end date row is optional on these pages
    if self.get_meeting_row(page, "End Date"):
        end = self.get_meeting_row(page, "End Date")
        end = self.tz.localize(dateutil.parser.parse(end))
    location = self.get_meeting_row(page, "Location")
    summary = ""
    if page.xpath('//div[contains(text(),"Meeting Overview")]'):
        summary = (page.xpath(
            '//div[div[contains(text(),"Meeting Overview")]]/div[contains(@class,"ml-3")]'
        )[0].text_content().strip())
    # only pass end_date when the page actually provided one
    if end:
        event = Event(
            name=com,
            start_date=start,
            end_date=end,
            location_name=location,
            description=summary,
        )
    else:
        event = Event(name=com,
                      start_date=start,
                      location_name=location,
                      description=summary)
    event.add_source(url)
    # each h5 is an agenda section; list items under it are agenda entries
    for h5 in page.xpath(
            '//div[contains(@class,"meeting-actions-bills")]/h5'):
        event.add_agenda_item(h5.text_content().strip())
        for agenda_item in h5.xpath("following-sibling::ul/li"):
            agenda_text = agenda_item.text_content().strip()
            # normalize en-dash separators to a plain hyphen
            agenda_text = re.sub(r"\s+\u2013\s+", " - ", agenda_text)
            item = event.add_agenda_item(agenda_text)
            # NOTE(review): r"H.*\s+\d+" is greedy and may capture text
            # before the bill number — confirm against live agendas.
            found_bills = re.findall(r"H.*\s+\d+", agenda_text)
            if found_bills:
                item.add_bill(found_bills[0])
    yield event
def scrape_upper_com(self, url, com, session):
    """Scrape one Senate committee's meeting table and yield Events.

    url -- committee page URL prefix; the session id is appended.
    com -- committee name (prefixed with "Senate" here).
    """
    url = f"{url}{session}"
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    com = f"Senate {com}"
    for row in page.xpath('//table[@id="meetingsTbl"]/tbody/tr'):
        day = row.xpath("td[1]")[0].text_content().strip()
        time = row.xpath("td[2]")[0].text_content().strip()
        notice = row.xpath("td[3]")[0].text_content().strip()
        location = "See Agenda"  # it's in the PDFs but not the web page
        date = dateutil.parser.parse(f"{day} {time}")
        date = self.tz.localize(date)
        if notice.lower() == "not meeting" or "cancelled" in notice.lower():
            continue
        event = Event(name=com, start_date=date, location_name=location)
        agenda_classes = [
            "mtgrecord_notice",
            "mtgrecord_expandedAgenda",
            "mtgrecord_attendance",
        ]
        for agenda_class in agenda_classes:
            # FIX: an absolute "//a[...]" xpath searches the whole document,
            # so every row picked up the FIRST row's links; use ".//" to
            # search within this row only.
            links = row.xpath(f".//a[@class='{agenda_class}']")
            if links:
                doc_url = links[0].xpath("@href")[0]
                doc_name = links[0].text_content().strip()
                event.add_document(doc_name, doc_url,
                                   media_type="application/pdf")
        # FIX: use distinct names for link URLs so `url` (the page we
        # scraped) is not clobbered before event.add_source(url) below.
        for link in row.xpath("td[7]/a"):
            media_url = link.xpath("@href")[0]
            doc_name = link.text_content().strip()
            event.add_media_link(doc_name, media_url, "audio/mpeg")
        for link in row.xpath("td[9]/a"):
            media_url = link.xpath("@href")[0]
            doc_name = link.text_content().strip()
            event.add_media_link(doc_name, media_url, "text/html")
        event.add_source(url)
        yield event
def scrape(self, start=None):
    """Scrape GA legislative meetings from the meetings API.

    start -- optional "YYYY-MM-DD" string; defaults to today.
    """
    if start is None:
        start = datetime.datetime.today()
    else:
        start = datetime.datetime.strptime(start, "%Y-%m-%d")
    # the API expects a date slug like "Mon Jan 01 2024"
    date_format = "%a %b %d %Y"
    date_slug = start.strftime(date_format)
    url = f"https://www.legis.ga.gov/api/meetings?startDate={date_slug}"
    page = self.get(url).json()
    for row in page:
        status = "tentative"
        title = row["subject"]
        # chamber 1 = House, 2 = Senate; joint meetings keep the raw title
        if "joint" not in title.lower():
            if row["chamber"] == 2:
                title = f"Senate {title}"
            elif row["chamber"] == 1:
                title = f"House {title}"
        # NOTE: `start` is reused as each row's start datetime from here on
        start = dateutil.parser.parse(row["start"])
        if start < self.tz.localize(datetime.datetime.now()):
            status = "passed"
        if "cancelled" in title.lower() or "canceled" in title.lower():
            status = "cancelled"
            # try to replace all variants of "[optional dash] cancel[l]ed [optional dash]"
            # so we can match up events to their pre-cancellation occurrence
            title = re.sub(r"-?\s*cancell?ed\s*-?\s*", " ", title, flags=re.I)
        where = row["location"]
        where = f"206 Washington St SW, Atlanta, Georgia, {where}"
        event = Event(
            name=title,
            start_date=start,
            location_name=where,
            classification="committee-meeting",
            status=status,
        )
        if row["agendaUri"] != "":
            event.add_document(
                "Agenda", row["agendaUri"], media_type="application/pdf"
            )
        if row["livestreamUrl"] is not None:
            event.add_media_link(
                "Video", row["livestreamUrl"], media_type="text/html"
            )
        event.add_source("https://www.legis.ga.gov/schedule/all")
        yield event
def scrape_chamber(self, chamber):
    """Scrape the MS chamber hearing schedule (a plain-text page).

    The page is parsed line by line: a weekday line starts a new day, a
    "HH:MM ROOM COMMITTEE" (or "BC+/AR+/AA+/TBA+ ...") line starts a new
    event, and subsequent non-matching lines accumulate as the current
    event's description. Each completed event is yielded when the next
    one begins, plus a final flush at the end.
    """
    chamber_abbr = self.chamber_abbrs[chamber]
    event_url = f"http://billstatus.ls.state.ms.us/htms/{chamber_abbr}_sched.htm"
    text = self.get(event_url).text
    event = None
    # running parser state for the current (not yet yielded) event
    when, time, room, com, desc = None, None, None, None, None
    for line in text.splitlines():
        # new date
        if re.match(
                r"^(MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY)",
                line,
                re.IGNORECASE,
        ):
            # NOTE(review): assumes a weekday line precedes the first time
            # line; otherwise `day` below would be unbound.
            day = line.split(" ")[0].strip()
        # timestamp, start of a new event
        if re.match(r"^\d{2}:\d{2}", line) or re.match(r"^(BC|AR|AA|TBA)\+", line):
            # if there's an event from the previous lines, yield it
            if when and room and com:
                event = Event(
                    name=com,
                    start_date=when,
                    location_name=room,
                    classification="committee-meeting",
                    description=desc,
                )
                event.add_source(event_url)
                yield event
            (time, room, com) = re.split(r"\s+", line, maxsplit=2)
            # if it's an after recess/adjourn
            # we can't calculate the time so just leave it empty
            if re.match(r"^(BC|AR|AA|TBA)\+", line):
                time = ""
            com = com.strip()
            when = dateutil.parser.parse(f"{day} {time}")
            when = self._tz.localize(when)
            # reset the description so we can populate it w/
            # upcoming lines (if any)
            desc = ""
        elif when and room and com:
            if line.strip():
                desc += "\n" + line.strip()
    # don't forget about the last event, which won't get triggered by a new date
    if when and room and com:
        event = Event(
            name=com,
            start_date=when,
            location_name=room,
            classification="committee-meeting",
            description=desc,
        )
        event.add_source(event_url)
        yield event
def scrape_upper(self, session_id):
    """Scrape VA Senate meeting listings for a session and yield Events."""
    list_url = f"https://lis.virginia.gov/cgi-bin/legp604.exe?{session_id}+oth+MTG&{session_id}+oth+MTG"
    page = self.get(list_url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(list_url)
    date = None
    # note the [td] at the end, they have some empty tr-s so skip them
    for row in page.xpath("//div[@id='mainC']/center/table/tr[td]"):
        # the date cell is only populated on the first row of each day
        if row.xpath("td[1]/text()")[0].strip() != "":
            date = row.xpath("td[1]/text()")[0].strip()
        description = row.xpath("td[3]/text()")[0].strip()
        # data on the house page is better
        if "senate" not in description.lower():
            continue
        time = row.xpath("td[2]/text()")[0].strip()
        status = "tentative"
        # FIX: the original tested for upper-case "CANCELLED" inside the
        # lower-cased string, so cancellations were never flagged.
        if "cancelled" in time.lower():
            status = "cancelled"
        try:
            when = dateutil.parser.parse(f"{date} {time}")
        except dateutil.parser._parser.ParserError:
            # time cell unparseable (e.g. cancellation text): date only
            when = dateutil.parser.parse(date)
        when = self._tz.localize(when)
        # TODO: Post covid figure out how they post locations
        if "virtual" in description.lower():
            location = "Virtual"
        else:
            location = "Unknown"
        event = Event(
            name=description,
            start_date=when,
            classification="committee-meeting",
            location_name=location,
            status=status,
        )
        event.add_source(list_url)
        yield event
def scrape_upper(self):
    """Scrape NH Senate schedule events from the calendar web service."""
    # http://gencourt.state.nh.us/dynamicdatafiles/Committees.txt?x=20201216031749
    url = "http://gencourt.state.nh.us/senate/schedule/CalendarWS.asmx/GetEvents"
    headers = {
        "Accept": "Accept: application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/json; charset=utf-8",
        "Referer": "http://gencourt.state.nh.us/senate/schedule/dailyschedule.aspx",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    }
    response = self.get(url, headers=headers)
    payload = json.loads(response.content)
    # real data is double-json encoded string in the 'd' key
    rows = json.loads(payload["d"])
    event_root = "http://gencourt.state.nh.us/senate/schedule"
    for row in rows:
        event_url = "{}/{}".format(event_root, row["url"])
        start = self._tz.localize(dateutil.parser.parse(row["start"]))
        end = self._tz.localize(dateutil.parser.parse(row["end"]))
        event = Event(
            name=row["title"].strip(),
            start_date=start,
            end_date=end,
            location_name="See Source",
        )
        event.add_source(event_url)
        self.scrape_upper_details(event, event_url)
        yield event
def scrape_lower_event(self, url):
    """Scrape a single NY Assembly agenda page into an Event."""
    doc = lxml.html.fromstring(self.get(url).content)
    doc.make_links_absolute(url)
    table = doc.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
    meta = table.xpath("tr[1]/td[1]/text()")
    # careful, the committee name in the page #committee_div
    # is getting inserted via JS
    # so use the one from the table, and strip the chair name
    com_name = "Assembly {}".format(re.sub(r"\(.*\)", "", meta[0]))
    start = self._tz.localize(dateutil.parser.parse(meta[1]))
    event = Event(
        name=com_name,
        start_date=start,
        location_name=meta[2],
    )
    event.add_participant(com_name, type="committee", note="host")
    event.add_source(url)
    bill_links = table.xpath('.//a[contains(@href, "/leg/")]')
    if bill_links:
        agenda = event.add_agenda_item("Bills under Consideration")
        for link in bill_links:
            agenda.add_bill(link.text_content().strip())
    yield event
def scrape_event_page(self, session, chamber, url, datetime):
    """Scrape a TX hearing-notice page into a single Event.

    datetime -- the event's start time. NOTE: this parameter shadows the
    datetime module inside this method; it is only used as a value here,
    so that is safe, but renaming it would be clearer.
    """
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainfo = {}
    plaintext = ""
    # build a KEY -> value map from each "KEY: value" paragraph, while
    # also accumulating the page's full plain text
    for p in info:
        content = re.sub(r"\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainfo[key.strip()] = val.strip()
    committee = metainfo["COMMITTEE"]
    where = metainfo["PLACE"]
    # the chair is sometimes appended to the PLACE line; split it out
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainfo["PLACE"] = where.strip()
        metainfo["CHAIR"] = chair.strip()
    chair = None
    if "CHAIR" in metainfo:
        chair = metainfo["CHAIR"]
    plaintext = re.sub(r"\s+", " ", plaintext).strip()
    # bill ids like "HB 123" / "SJR 4" anywhere in the notice text
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)
    event = Event(name=committee,
                  start_date=self._tz.localize(datetime),
                  location_name=where)
    event.dedupe_key = url
    event.add_source(url)
    event.add_participant(committee, type="committee", note="host")
    if chair is not None:
        event.add_participant(chair, type="legislator", note="chair")
    # add a single agenda item, attach all bills
    agenda = event.add_agenda_item(plaintext)
    for bill in bills:
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        agenda.add_bill(bill_id)
    yield event
def scrape(self):
    """Scrape AL interim-meeting table rows into Events.

    Falls back to pulling the time from the description when the time
    cell is empty, and to the bare date when no time is found at all.
    """
    EVENTS_URL = (
        "http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx"
    )
    rows = self.lxmlize(EVENTS_URL).xpath(
        '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')
    for row in rows[1:]:
        date = row.xpath("td")[0].text_content().strip()
        time = row.xpath("td")[1].text_content().strip()
        details = row.xpath("td")[4].text_content().strip()
        if time != "":
            date_with_time = "{} {}".format(date, time)
        else:
            # sometimes the time is the first part of the decription
            match = re.match(r"\s*\d+:\d+\s*[ap]m", details, flags=re.I)
            if match:
                date_with_time = "{} {}".format(date, match.group())
            else:
                # FIX: date_with_time was unbound here, raising NameError
                # for rows with no time anywhere; fall back to date only.
                date_with_time = date
        location = row.xpath("td")[2].text_content().strip()
        # 11 South Union Street, Montgomery, Alabama, United States
        # TODO: IF location is "room (X)" add state house
        # TODO: REplace "state house" with address
        # 32°22′37.294″N 86°17′57.991″W
        # host = row.xpath('td')[3].text_content().strip()
        name = row.xpath("td")[3].text_content().strip()
        if name == "":
            continue
        event = Event(
            start_date=self._TZ.localize(
                dateutil.parser.parse(date_with_time)),
            name=name,
            location_name=location,
            description=details,
        )
        event.add_source(EVENTS_URL)
        yield event
def scrape(self, start=None, end=None):
    """Scrape calendar events between start and end ("YYYY-MM-DD" strings).

    Defaults: start = today, end = start + 3 months. Each calendar item
    triggers a second fetch of its detail page for location/agenda.
    """
    if start is None:
        start = dt.datetime.today()
    else:
        start = dateutil.parser.parse(start)
    if end is None:
        end = start + relativedelta(months=+3)
    else:
        end = dateutil.parser.parse(end)
    start = start.strftime("%Y-%m-%d")
    end = end.strftime("%Y-%m-%d")
    url = f"{self.base_url}calendar-data?start={start}&end={end}"
    data = json.loads(self.scraper.get(url).content)
    for item in data:
        name = item["title"].strip()
        # skip cancelled meetings and floor sessions
        if "canceled" in name.lower():
            continue
        if "house session" in name.lower(
        ) or "senate session" in name.lower():
            continue
        url = f"{self.base_url}{item['url']}"
        when = dateutil.parser.parse(item["start"])
        when = self._tz.localize(when)
        # the detail page carries the location and the agenda PDF link
        page = self.scraper.get(url).content
        page = lxml.html.fromstring(page)
        location = page.xpath(
            '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
        )[0].strip()
        agenda_url = page.xpath(
            '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
        )[0]
        event = Event(
            name=name,
            start_date=when,
            location_name=location,
        )
        event.add_participant(name, type="committee", note="host")
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        event.add_source(url)
        yield event
def upper_parse_agenda_item(self, item):
    """Expand one NY Senate agenda listing item into Events.

    Fetches the full meeting record from the API and yields an Event for
    the addendum whose id matches item["addendum"]; cancelled and
    non-matching addenda are skipped.
    """
    response = self.api_client.get(
        "meeting",
        year=item["agendaId"]["year"],
        agenda_id=item["agendaId"]["number"],
        committee=item["committeeId"]["name"],
    )
    data = response["result"]
    chamber = data["committee"]["committeeId"]["chamber"].title()
    com_code = data["committee"]["committeeId"]["name"]
    com_name = f"{chamber} {com_code}"
    # each "meeting" is actually a listing page of multiple meetings of the same committee
    # broken out by different addendumId
    for addendum in data["committee"]["addenda"]["items"]:
        if addendum["addendumId"] != item["addendum"]:
            continue
        meeting = addendum["meeting"]
        when = dateutil.parser.parse(meeting["meetingDateTime"])
        when = self._tz.localize(when)
        location = meeting["location"]
        description = meeting["notes"]
        if location == "":
            location = "See Committee Site"
        if "canceled" in description.lower():
            continue
        event = Event(
            name=com_name,
            start_date=when,
            location_name=location,
            description=description,
        )
        event.add_participant(com_name, type="committee", note="host")
        # slugify the committee code to build its nysenate.gov page URL
        com_code = (com_code.lower().replace("'", "").replace(" ", "-").replace(
            ",", ""))
        url = f"https://www.nysenate.gov/committees/{com_code}"
        event.add_source(url)
        bills = addendum["bills"]["items"]
        if len(bills) > 0:
            agenda = event.add_agenda_item("Bills under consideration")
            for bill in bills:
                agenda.add_bill(bill["billId"]["printNo"])
        yield event
def scrape_upper(self):
    """Scrape the MO Senate hearing schedule page and yield Events."""
    listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"
    html = self.get(listing_url).text
    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split("<hr />")[1:]:
        page = lxml.html.fromstring(fragment)
        when_date = self.row_content(page, "Date:")
        when_time = self.row_content(page, "Time:")
        location = self.row_content(page, "Room:")
        location = "{}, {}".format(
            location, "201 W Capitol Ave, Jefferson City, MO 65101")
        # com = self.row_content(page, 'Committee:')
        # committee cell contains "<name>, Senator <chair>"; drop the chair
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
        com = com.split(", Senator")[0].strip()
        start_date = self._TZ.localize(
            dateutil.parser.parse("{} {}".format(when_date, when_time)))
        event = Event(start_date=start_date,
                      name=com,
                      location_name=location)
        event.add_source(listing_url)
        event.add_participant(com, type="committee", note="host")
        # agenda tables: when a bill link is present, the agenda text is
        # in the second row; otherwise it is in the first row
        for bill_table in page.xpath(
                '//table[@width="85%" and @border="0"]'):
            bill_link = ""
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath("string(tr[2])").strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)
                bill_link = bill_table.xpath(
                    self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath("string(tr[1])").strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)
        yield event
def scrape_cal_page(self, url):
    """Scrape one calendar page of accordion events; recurse to the next.

    Yields an Event per accordion article, then follows the
    "Upcoming Events" link if present.
    """
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for row in page.xpath("//article[contains(@class,'accordion')]"):
        when = row.xpath(".//time/@datetime")[0]
        when = dateutil.parser.parse(when)
        title = row.xpath(
            ".//h3[contains(@class,'heading-link')]/text()")[0].strip()
        description = row.xpath(
            "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
        )[0].text_content()
        # fix special chars
        description = (description.replace("\n\u2013", " ").replace(
            "\n", " ").replace("\u203a", ""))
        description = description.replace("More about this event",
                                          "").strip()
        location = row.xpath(
            "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
        )[0].text_content()
        event = Event(
            name=title,
            description=description,
            start_date=when,
            location_name=location,
        )
        agenda_url = row.xpath(
            ".//a[contains(text(),'More about this event')]/@href")
        if agenda_url != []:
            event.add_document("Details and Agenda",
                               agenda_url[0],
                               media_type="text/html")
        if "committee meeting" in title.lower():
            com_name = title.replace("Committee Meeting", "").strip()
            # FIX: participant type was misspelled "commitee", so the
            # committee participant was recorded with a bogus type.
            event.add_participant(com_name, type="committee", note="host")
        event.add_source(url)
        yield event
    if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
        next_url = page.xpath(
            "//a[contains(text(), 'Upcoming Events')]/@href")[0]
        yield from self.scrape_cal_page(next_url)
def scrape_events(self, page):
    """Parse NE hearing cards from a schedule page and yield Events.

    Raises EmptyScrape when the page reports no hearings in range.
    """
    page = lxml.html.fromstring(page)
    if page.xpath(
            "//h3[contains(text(),'There are no hearings for the date range')]"
    ):
        raise EmptyScrape
        return  # NOTE(review): unreachable after the raise above
    for meeting in page.xpath('//div[@class="card mb-4"]'):
        com = meeting.xpath(
            'div[contains(@class, "card-header")]/text()')[0].strip()
        details = meeting.xpath(
            'div[contains(@class, "card-header")]/small/text()')[0].strip(
            )
        # header small-text is "<location> - <time>"
        (location, time) = details.split(" - ")
        # turn room numbers into the full address
        if location.lower().startswith("room"):
            location = "1445 K St, Lincoln, NE 68508, {}".format(location)
        # the day heading precedes the group of cards for that date
        day = meeting.xpath(
            "./preceding-sibling::h2[@class='text-center']/text()"
        )[-1].strip()
        # Thursday February 27, 2020 1:30 PM
        date = "{} {}".format(day, time)
        event_date = self._tz.localize(
            datetime.datetime.strptime(date, "%A %B %d, %Y %I:%M %p"))
        event = Event(
            name=com,
            start_date=event_date,
            classification="committee-meeting",
            description="Committee Meeting",
            location_name=location,
        )
        event.add_committee(com, note="host")
        # agenda rows: td[1] holds an optional bill link, td[3] the text
        for row in meeting.xpath("div/table/tr"):
            if not row.xpath("td[3]"):
                continue
            agenda_desc = row.xpath("td[3]/text()")[0].strip()
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if row.xpath("td[1]/a"):
                # bill link
                agenda_item.add_bill(
                    row.xpath("td[1]/a/text()")[0].strip())
        event.add_source(
            "https://nebraskalegislature.gov/calendar/calendar.php")
        yield event
def scrape_upper_events(self):
    """Yield Events from the FL Senate daily-calendar RSS feed."""
    url = "https://www.flsenate.gov/Tracker/RSS/DailyCalendar"
    feed = feedparser.parse(self.get(url).text)
    for entry in feed["entries"]:
        # The feed breaks the RSS standard by making the pubdate the
        # actual event's date, not the RSS item publish date
        when = pytz.utc.localize(
            datetime.datetime(*entry["published_parsed"][:6]))
        summary_parts = entry["summary"].split(" - ")
        desc = summary_parts[0]
        location = summary_parts[1]
        event = Event(name=desc,
                      start_date=when,
                      description=desc,
                      location_name=location)
        event.add_source(entry["link"])
        yield event
def scrape_meeting_notice(self, item, url):
    """Build an Event from one DE committee-meeting JSON record.

    item -- meeting record dict from the DE meetings listing.
    url -- listing URL (unused here; sources come from the meeting id).
    """
    # Since Event Name is not provided for all mettings.
    if "Joint" in str(item["CommitteeName"]):
        event_name = str(item["CommitteeName"])
    else:
        event_name = "{} {}".format(str(item["CommitteeTypeName"]),
                                    str(item["CommitteeName"]))
    # 04/25/2012 03:00:00 PM
    # NOTE(review): the sample above has a 4-digit year and seconds, but
    # this format uses %y and omits %S — confirm against live
    # MeetingDateTime values; as written strptime would reject the sample.
    fmt = "%m/%d/%y %I:%M %p"
    start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_committee(name=str(item["CommitteeName"]),
                        id=item["CommitteeId"])
    html_url = f'https://legis.delaware.gov/MeetingNotice?committeeMeetingId={item["CommitteeMeetingId"]}'
    event.add_source(html_url)
    page_url = f'https://legis.delaware.gov/json/MeetingNotice/GetCommitteeMeetingItems?committeeMeetingId={item["CommitteeMeetingId"]}'
    page_data = []
    try:
        page_data = self.post(page_url).json()["Data"]
    except json.decoder.JSONDecodeError:
        # No agenda items
        self.info(f"POST returned nothing on {page_url}")
    # NOTE(review): this loop reuses the name `item`, shadowing the
    # meeting record parameter from here on.
    for item in page_data:
        a = event.add_agenda_item(description=str(item["ItemDescription"]))
        if item["LegislationDisplayText"] is not None:
            a.add_bill(item["LegislationDisplayText"])
        event.add_person(
            name=str(item["PrimarySponsorShortName"]),
            id=str(item["PrimarySponsorPersonId"]),
            note="Sponsor",
        )
    yield event
def scrape(self, chamber=None):
    """Scrape OK House meeting notices (ASP.NET monthly view) into Events."""
    # we need to GET the page once to set up the ASP.net vars
    # then POST to it to set it to monthly
    url = "https://www.okhouse.gov/Committees/MeetingNotices.aspx"
    params = {
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctl00$ContentPlaceHolder1$RadAjaxPanel1Panel|ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00_FormDecorator1_ClientState": "",
        "ctl00_RadToolTipManager1_ClientState": "",
        "ctl00_mainNav_ClientState": "",
        "ctl00$ContentPlaceHolder1$cbToday": "on",
        "ctl00$ContentPlaceHolder1$cbMonthly": "on",
        "ctl00_ContentPlaceHolder1_dgrdNotices_ClientState": "",
        "__ASYNCPOST": "true",
        "RadAJAXControlID": "ctl00_ContentPlaceHolder1_RadAjaxPanel1",
    }
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    html = self.asp_post(url, page, params)
    page = lxml.html.fromstring(html)
    for row in page.xpath('//tr[contains(@id,"_dgrdNotices_")]'):
        status = "tentative"
        agenda_link = row.xpath('.//a[@id="hlMeetAgenda"]')[0]
        title = agenda_link.xpath("text()")[0].strip()
        agenda_url = agenda_link.xpath("@href")[0]
        location = row.xpath("td[3]")[0].text_content().strip()
        # swap in a space for the <br/>
        when = row.xpath("td[4]")[0]
        for br in when.xpath(".//br"):
            br.tail = " " + br.tail if br.tail else " "
        when = when.text_content().strip()
        if "cancelled" in when.lower():
            status = "cancelled"
        # FIX: the flag was passed as re.sub's positional `count`
        # argument (re.IGNORECASE == 2 → count=2, case-sensitive match);
        # pass it as flags= so "CANCELLED" is actually stripped.
        when = re.sub("CANCELLED", "", when, flags=re.IGNORECASE)
        when = self._tz.localize(dateutil.parser.parse(when))
        event = Event(
            name=title,
            location_name=location,
            start_date=when,
            classification="committee-meeting",
            status=status,
        )
        event.add_source(url)
        event.add_document("Agenda", agenda_url,
                           media_type="application/pdf")
        yield event
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
    different formatting to events based on which group they correspond
    to. `group` should be one of the following strings: 'house',
    'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)
        # only yield events with all three required fields present
        if when and description and location:
            event = Event(
                name=description,
                start_date=when.replace(tzinfo=self.tz),
                description=description,
                location_name=location,
            )
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not defined anywhere in this method —
            # this raises NameError when an event is yielded. The source
            # URL presumably should be a parameter or instance attribute;
            # confirm against the caller and fix.
            event.add_source(url)
            yield event
def scrape(self):
    """Scrape AL interim-meeting table rows into Events."""
    EVENTS_URL = (
        "http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx"
    )
    rows = self.lxmlize(EVENTS_URL).xpath(
        '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')
    # first row is the header
    for row in rows[1:]:
        cells = row.xpath("td")
        date = cells[0].text_content().strip()
        time = cells[1].text_content().strip()
        date_with_time = "{} {}".format(date, time)
        location = cells[2].text_content().strip()
        # 11 South Union Street, Montgomery, Alabama, United States
        # TODO: IF location is "room (X)" add state house
        # TODO: REplace "state house" with address
        # 32°22′37.294″N 86°17′57.991″W
        # host = cells[3].text_content().strip()
        name = cells[3].text_content().strip()
        details = cells[4].text_content().strip()
        start = datetime.datetime.strptime(date_with_time,
                                           self._DATETIME_FORMAT)
        event = Event(
            start_date=self._TZ.localize(start),
            name=name,
            location_name=location,
            description=details,
        )
        event.add_source(EVENTS_URL)
        yield event
def create_event(self, committee, agenda_document):
    """Return a committee-meeting Event for an SD agenda document.

    committee -- record with a "FullName" key.
    agenda_document -- record with a parseable "DocumentDate" key.
    """
    return Event(
        name=committee["FullName"],
        start_date=dateutil.parser.parse(agenda_document["DocumentDate"]),
        location_name="500 E Capitol Ave, Pierre, SD 57501",
        classification="committee-meeting",
    )
def scrape(self, session=None):
    """Scrape ND interim committee meetings for a session.

    Collects Events per committee table (keyed by event URL), then
    augments them via the monthly calendar pages before yielding.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    # figuring out starting year from metadata
    # NOTE(review): if `session` matches no legislative_sessions entry,
    # start_year is unbound and the f-string below raises NameError.
    for item in self.jurisdiction.legislative_sessions:
        if item["identifier"] == session:
            start_year = item["start_date"][:4]
            self.year = start_year
            break
    url = f"https://www.legis.nd.gov/assembly/{session}-{start_year}/committees/interim/committee-meeting-summary"
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for table in page.xpath('//table[contains(@class,"views-table")]'):
        com = table.xpath("caption/a")[0].text_content().strip()
        for row in table.xpath("tbody/tr"):
            date_link = row.xpath("td[1]/strong/a")[0]
            event_url = date_link.xpath("@href")[0]
            date = date_link.xpath("span")[0].text_content().strip()
            date = dateutil.parser.parse(date)
            date = self._tz.localize(date)
            # remember which months have events so the calendar pass
            # below knows which pages to walk
            self.event_months.add(date.strftime("%Y-%m"))
            location = "See Agenda"
            event = Event(name=com, start_date=date, location_name=location)
            event.add_source(event_url)
            for link in row.xpath("td[2]//a"):
                link_text = link.text_content().strip()
                # skip live broadcast links
                if "video.legis" in link_text:
                    continue
                event.add_document(link_text,
                                   link.xpath("@href")[0],
                                   media_type="application/pdf")
            self.events[event_url] = event
    for year_month in self.event_months:
        self.scrape_calendar(year_month)
    for key in self.events:
        yield self.events[key]
def parse_event(self, row, chamber):
    """Build an Event from one AK committee-meeting XML row."""
    # sample event available at http://www.akleg.gov/apptester.html
    committee_code = row.xpath("string(Sponsor)").strip()
    # map the sponsor code to a known committee name where possible
    if committee_code in self.COMMITTEES[chamber]:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            self.COMMITTEES[chamber][committee_code]["name"],
        )
    else:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            "MISCELLANEOUS",
        )
    name = "{} {}".format(self.COMMITTEES_PRETTY[chamber],
                          row.xpath("string(Title)").strip())
    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    # NOTE(review): this branch can never trigger — `name` always
    # contains the chamber prefix, so it is never the empty string.
    if name == "":
        name = committee_name
    location = row.xpath("string(Location)").strip()
    # events with no location all seem to be committee hearings
    if location == "":
        location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"
    start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
    # todo: do i need to self._TZ.localize() ?
    event = Event(start_date=start_date, name=name, location_name=location)
    event.add_source("http://w3.akleg.gov/index.php#tab4")
    if committee_code in self.COMMITTEES[chamber]:
        event.add_participant(committee_name, type="committee", note="host")
    for item in row.xpath("Agenda/Item"):
        agenda_desc = item.xpath("string(Text)").strip()
        if agenda_desc != "":
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath("BillRoot"):
                bill_id = item.xpath("string(BillRoot)")
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r"\s+", " ", bill_id)
                agenda_item.add_bill(bill_id)
    yield event
def scrape_chamber(self, chamber):
    """Yield CA committee-hearing Events for `chamber` from the DB mirror.

    Hearings are grouped by (location, date) so a single Event covers
    every bill heard by a committee at that sitting.
    """
    grouped_hearings = defaultdict(list)
    for hearing in self.session.query(CACommitteeHearing):
        location = (self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description)
        date = self._tz.localize(hearing.hearing_date)
        # the location description's 3-char prefix encodes the chamber
        chamber_abbr = location[0:3]
        event_chamber = {"Asm": "lower", "Sen": "upper"}[chamber_abbr]
        if event_chamber != chamber:
            continue
        grouped_hearings[(location, date)].append(hearing)
    for ((location, date), hearings) in grouped_hearings.items():
        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        # strip the session prefix digits, e.g. "20210AB123" -> "AB 123"
        bills = [
            "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
            for bill in bill_ids
        ]
        # Dereference the committee_nr number and get display name.
        msg = "More than one committee meeting at (location, date) %r"
        msg = msg % ((location, date), )
        # invariant: one committee per (location, date) group
        assert len(set(hearing.committee_nr
                       for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]
        desc = "Committee Meeting: " + committee_name
        event = Event(name=desc,
                      start_date=date,
                      location_name=committee_name)
        for bill_id in bills:
            if "B" in bill_id:
                type_ = "bill"
            else:
                type_ = "resolution"
            item = event.add_agenda_item("consideration")
            item.add_bill(bill_id, note=type_)
        event.add_person(committee_name + " Committee", note="host")
        event.add_source("https://downloads.leginfo.legislature.ca.gov/")
        yield event
def scrape_lower_item(self, page):
    """Yield one Event scraped from a single MO House hearing table.

    Reads committee, date, time and location cells via
    ``self.table_row_content`` and attaches every linked bill as an
    agenda item.
    """
    com = self.table_row_content(page, "Committee:")
    when_date = self.table_row_content(page, "Date:")
    when_time = self.table_row_content(page, "Time:")
    location = self.table_row_content(page, "Location:")

    if "house hearing room" in location.lower():
        location = "{}, {}".format(
            location, "201 W Capitol Ave, Jefferson City, MO 65101")

    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(" :", ":")

    # a.m. and p.m. seem to confuse dateutil.parser
    when_time = when_time.replace("A.M.", "AM").replace("P.M.", "PM")

    # Some times carry extra text after the meridian, e.g.
    # '1:30 PM or upon adjournment'.  Drop everything AFTER the AM/PM
    # marker but keep the marker itself: the previous
    # split("PM", 1)[0] discarded "PM", so dateutil read "1:30" as
    # 1:30 AM instead of 1:30 PM.
    if "upon" in when_time:
        when_time = re.sub(r"(AM|PM).*$", r"\1", when_time)

    # fix '- Upcoming', '- In Progress' in dates
    when_date = re.sub(r"- (.*)", "", when_date).strip()

    try:
        start_date = dateutil.parser.parse(f"{when_date} {when_time}")
    except dateutil.parser._parser.ParserError:
        # Time was unusable (e.g. just "Upon adjournment"); fall back
        # to a date-only parse.
        start_date = dateutil.parser.parse(when_date)

    start_date = self._TZ.localize(start_date)

    event = Event(start_date=start_date, name=com, location_name=location)
    event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")
    event.add_participant(com, type="committee", note="host")

    # different from general MO link xpath due to the <b>
    house_link_xpath = ('.//a[contains(@href, "Bill.aspx") '
                        'or contains(@href, "bill.aspx")]/b/text()')

    for bill_title in page.xpath(house_link_xpath):
        bill_no = bill_title.split("--")[0].strip()
        bill_no = bill_no.replace("HCS", "").strip()
        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)

    yield event
def scrape_chamber(self, chamber, session, start, end):
    """Yield WA committee-meeting Events for one chamber between start/end,
    skipping cancelled meetings and meetings of the other chamber."""
    page = self.get_xml(start, end)

    for row in xpath(page, "//wa:CommitteeMeeting"):
        if xpath(row, "string(wa:Cancelled)") == "true":
            continue

        agency = xpath(row, "string(wa:Agency)")
        if self.chambers[agency] != chamber:
            continue

        when = datetime.datetime.strptime(
            xpath(row, "string(wa:Date)"), "%Y-%m-%dT%H:%M:%S")
        when = self._tz.localize(when)

        committee = xpath(
            row, "string(wa:Committees/wa:Committee/wa:LongName)")
        agenda_id = xpath(row, "string(wa:AgendaId)")
        notes = xpath(row, "string(wa:Notes)")

        # XML has a wa:Address but it seems useless
        room = xpath(row, "string(wa:Room)")
        building = xpath(row, "string(wa:Building)")
        city = xpath(row, "string(wa:City)")
        state = xpath(row, "string(wa:State)")
        location = "{}, {}, {} {}".format(room, building, city, state)

        event = Event(
            name=committee,
            start_date=when,
            location_name=location,
            description=notes,
        )
        event.add_source(
            "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}".format(
                agenda_id))
        event.add_participant(committee, type="committee", note="host")
        event.extras["agendaId"] = agenda_id

        self.scrape_agenda_items(agenda_id, event)
        yield event
def scrape_page(self, url, session, chamber):
    """Return an Event scraped from a single IL hearing-notice page.

    Reads the metadata table (location, subject, scheduled date) and the
    bill table, attaching each listed bill as an agenda item.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    info = tables[0]
    metainf = {}
    for row in info.xpath(".//tr"):
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf["Location:"]
    subject_matter = metainf["Subject Matter:"]
    description = "{}, {}".format(ctty_name, subject_matter)

    # Collapse whitespace and make sure AM/PM is space-separated so
    # strptime's %p directive can match (e.g. "10:00AM" -> "10:00 AM").
    # Renamed from `datetime` to avoid shadowing the datetime module.
    when_text = re.sub(r"\s+", " ", metainf["Scheduled Date:"])
    repl = {"AM": " AM", "PM": " PM"}  # Space shim.
    for r in repl:
        when_text = when_text.replace(r, repl[r])
    when = self.localize(
        dt.datetime.strptime(when_text, "%b %d, %Y %I:%M %p"))

    event = Event(description, start_date=when, location_name=where)
    event.add_source(url)

    if ctty_name.startswith("Hearing Notice For"):
        # BUG FIX: str.replace returns a new string; the original call
        # discarded its result, so the prefix was never actually removed
        # from the participant name.
        ctty_name = ctty_name.replace("Hearing Notice For", "").strip()
    event.add_participant(ctty_name, "organization")

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event