def scrape(self, chamber=None):
    """Scrape Oklahoma House committee meeting notices.

    We need to GET the page once to pick up the ASP.NET viewstate
    variables, then POST back to switch the listing to the monthly view.
    Yields Event objects.
    """
    url = "https://www.okhouse.gov/Committees/MeetingNotices.aspx"
    params = {
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctl00$ContentPlaceHolder1$RadAjaxPanel1Panel|ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00_FormDecorator1_ClientState": "",
        "ctl00_RadToolTipManager1_ClientState": "",
        "ctl00_mainNav_ClientState": "",
        "ctl00$ContentPlaceHolder1$cbToday": "on",
        "ctl00$ContentPlaceHolder1$cbMonthly": "on",
        "ctl00_ContentPlaceHolder1_dgrdNotices_ClientState": "",
        "__ASYNCPOST": "true",
        "RadAJAXControlID": "ctl00_ContentPlaceHolder1_RadAjaxPanel1",
    }
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    html = self.asp_post(url, page, params)
    page = lxml.html.fromstring(html)

    for row in page.xpath('//tr[contains(@id,"_dgrdNotices_")]'):
        status = "tentative"
        agenda_link = row.xpath('.//a[@id="hlMeetAgenda"]')[0]
        title = agenda_link.xpath("text()")[0].strip()
        agenda_url = agenda_link.xpath("@href")[0]
        location = row.xpath("td[3]")[0].text_content().strip()

        # swap in a space for the <br/> so date and time don't run together
        when = row.xpath("td[4]")[0]
        for br in when.xpath(".//br"):
            br.tail = " " + br.tail if br.tail else " "
        when = when.text_content().strip()
        if "cancelled" in when.lower():
            status = "cancelled"
        # BUG FIX: re.IGNORECASE was previously passed as the positional
        # `count` argument of re.sub; it must be given as `flags=` for the
        # case-insensitive strip to take effect.
        when = re.sub("CANCELLED", "", when, flags=re.IGNORECASE)
        when = self._tz.localize(dateutil.parser.parse(when))

        event = Event(
            name=title,
            location_name=location,
            start_date=when,
            classification="committee-meeting",
            status=status,
        )
        event.add_source(url)
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        yield event
def scrape(self, start=None):
    """Scrape Georgia legislative meetings from the meetings API.

    :param start: optional "YYYY-MM-DD" string; defaults to today.
    Yields Event objects.
    """
    if start is None:
        start = datetime.datetime.today()
    else:
        start = datetime.datetime.strptime(start, "%Y-%m-%d")

    date_slug = start.strftime("%a %b %d %Y")
    url = f"https://www.legis.ga.gov/api/meetings?startDate={date_slug}"

    for row in self.get(url).json():
        status = "tentative"
        title = row["subject"]

        # prefix the chamber name unless it's a joint meeting
        if "joint" not in title.lower():
            if row["chamber"] == 2:
                title = f"Senate {title}"
            elif row["chamber"] == 1:
                title = f"House {title}"

        when = dateutil.parser.parse(row["start"])
        if when < self.tz.localize(datetime.datetime.now()):
            status = "passed"

        if "cancelled" in title.lower() or "canceled" in title.lower():
            status = "cancelled"
            # try to replace all variants of
            # "[optional dash] cancel[l]ed [optional dash]"
            # so we can match up events to their pre-cancellation occurrence
            title = re.sub(r"-?\s*cancell?ed\s*-?\s*", " ", title, flags=re.I)

        where = f"206 Washington St SW, Atlanta, Georgia, {row['location']}"

        event = Event(
            name=title,
            start_date=when,
            location_name=where,
            classification="committee-meeting",
            status=status,
        )
        if row["agendaUri"] != "":
            event.add_document(
                "Agenda", row["agendaUri"], media_type="application/pdf"
            )
        if row["livestreamUrl"] is not None:
            event.add_media_link(
                "Video", row["livestreamUrl"], media_type="text/html"
            )
        event.add_source("https://www.legis.ga.gov/schedule/all")
        yield event
def scrape_events(self, session, start_date):
    """Scrape committee meetings from the legislature API.

    :param session: session identifier (key into SESSION_KEYS)
    :param start_date: optional "YYYY-MM-DD" string; defaults to today
    :raises EmptyScrape: when the API returns no meetings
    Yields Event objects.
    """
    session_key = SESSION_KEYS[session]

    if start_date is None:
        start_date = datetime.date.today()
    else:
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")

    # map committee code -> committee name for event naming
    committees_by_code = {}
    committees_response = self.api_client.get("committees", session=session_key)
    for committee in committees_response:
        committees_by_code[committee["CommitteeCode"]] = committee["CommitteeName"]

    meetings_response = self.api_client.get(
        "committee_meetings",
        start_date=start_date.strftime(self._DATE_FORMAT),
        session=session_key,
    )

    if len(meetings_response) == 0:
        raise EmptyScrape

    for meeting in meetings_response:
        event_date = self._TZ.localize(
            datetime.datetime.strptime(meeting["MeetingDate"], self._DATE_FORMAT)
        )
        com_name = committees_by_code[meeting["CommitteeCode"]]

        event = Event(
            start_date=event_date, name=com_name, location_name=meeting["Location"]
        )
        event.add_source(meeting["AgendaUrl"])
        event.extras["meeting_guid"] = meeting["MeetingGuid"]
        # BUG FIX: previously read committee["CommitteeCode"] -- the stale
        # loop variable left over from the committees loop above -- so every
        # event got the LAST committee's code. Use the current meeting's code.
        event.extras["committee_code"] = meeting["CommitteeCode"]

        event.add_participant(com_name, type="committee", note="host")

        for row in meeting["CommitteeAgendaItems"]:
            if row["Comments"] is not None:
                agenda = event.add_agenda_item(row["Comments"])
                if row["MeasureNumber"] is not None:
                    bill_id = "{} {}".format(
                        row["MeasurePrefix"], row["MeasureNumber"]
                    )
                    agenda.add_bill(bill_id)

        for row in meeting["CommitteeMeetingDocuments"]:
            event.add_document(
                note=row["ExhibitTitle"],
                url=row["DocumentUrl"],
                on_duplicate="ignore",
            )
        yield event
def scrape(self, session=None):
    """Scrape North Dakota interim committee meeting summaries.

    Collects events per committee table, records the months seen, then
    scrapes each month's calendar before yielding all collected events.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # pull the session's starting year out of the jurisdiction metadata
    for item in self.jurisdiction.legislative_sessions:
        if item["identifier"] == session:
            start_year = item["start_date"][:4]
            self.year = start_year
            break

    url = (
        f"https://www.legis.nd.gov/assembly/{session}-{start_year}"
        "/committees/interim/committee-meeting-summary"
    )

    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(url)

    for table in page.xpath('//table[contains(@class,"views-table")]'):
        com = table.xpath("caption/a")[0].text_content().strip()
        for row in table.xpath("tbody/tr"):
            date_link = row.xpath("td[1]/strong/a")[0]
            event_url = date_link.xpath("@href")[0]

            date = date_link.xpath("span")[0].text_content().strip()
            date = self._tz.localize(dateutil.parser.parse(date))

            self.event_months.add(date.strftime("%Y-%m"))

            # location isn't on the listing page, only in the agenda PDFs
            event = Event(name=com, start_date=date, location_name="See Agenda")
            event.add_source(event_url)

            for link in row.xpath("td[2]//a"):
                link_text = link.text_content().strip()

                # skip live broadcast links
                if "video.legis" in link_text:
                    continue

                event.add_document(
                    link_text,
                    link.xpath("@href")[0],
                    media_type="application/pdf",
                )

            self.events[event_url] = event

    for year_month in self.event_months:
        self.scrape_calendar(year_month)

    for key in self.events:
        yield self.events[key]
def scrape(self):
    """Scrape committee hearing notices from the agenda calendar page."""
    page = self.lxmlize(calurl)

    # skip the header row
    for row in page.xpath("//table[@class='agenda-body']//tr")[1:]:
        detail_links = row.xpath(".//a[contains(@title,'Committee Details')]")
        if len(detail_links) != 1:
            continue

        who = self.scrape_participants(detail_links[0].attrib["href"])

        tds = row.xpath("./*")
        date = tds[0].text_content().strip()
        chamber, cttie = [
            x.strip() for x in tds[1].text_content().strip().split(" - ", 1)
        ]

        info = tds[2]
        notice_link = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = notice_link.attrib["href"]
        name = notice_link.text
        time, where = info.xpath("./i/text()")

        what = tds[3].text_content().replace("Items: ", "")
        if "(None)" in what:
            continue
        agenda_entries = [x.strip() for x in what.split(";")]

        # the page omits the year, so assume the current one
        when = dt.datetime.strptime(
            ", ".join([date, str(dt.datetime.now().year), time]),
            "%a %b %d, %Y, %I:%M %p",
        )

        if cttie:
            cttie = cttie.replace("Committee on", "").strip()
            cttie = f"{chamber} {cttie}"
            name = cttie

        event = Event(
            name=name, location_name=where, start_date=self._tz.localize(when)
        )
        event.add_source(calurl)
        event.add_committee(cttie, note="host")
        event.add_document("notice", notice, media_type="application/pdf")

        for entry in agenda_entries:
            item = event.add_agenda_item(entry)
            # entries that look like bill ids get attached as bills
            if entry.startswith(("AB", "SB")):
                item.add_bill(entry)

        for participant in who:
            event.add_person(participant["name"])

        yield event
def scrape(self, start=None, end=None):
    """Scrape committee meetings from the calendar-data JSON endpoint.

    :param start: optional date string; defaults to today
    :param end: optional date string; defaults to start + 3 months
    Yields Event objects, fetching each event's detail page for the
    location and agenda link.
    """
    start_dt = dt.datetime.today() if start is None else dateutil.parser.parse(start)
    if end is None:
        end_dt = start_dt + relativedelta(months=+3)
    else:
        end_dt = dateutil.parser.parse(end)

    listing_url = (
        f"{self.base_url}calendar-data"
        f"?start={start_dt.strftime('%Y-%m-%d')}&end={end_dt.strftime('%Y-%m-%d')}"
    )
    data = json.loads(self.scraper.get(listing_url).content)

    for item in data:
        name = item["title"].strip()
        lowered = name.lower()

        if "canceled" in lowered:
            continue
        # floor sessions are handled elsewhere
        if "house session" in lowered or "senate session" in lowered:
            continue

        detail_url = f"{self.base_url}{item['url']}"
        when = self._tz.localize(dateutil.parser.parse(item["start"]))

        detail = lxml.html.fromstring(self.scraper.get(detail_url).content)
        location = detail.xpath(
            '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
        )[0].strip()
        agenda_url = detail.xpath(
            '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
        )[0]

        event = Event(
            name=name,
            start_date=when,
            location_name=location,
        )
        event.add_participant(name, type="committee", note="host")
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        event.add_source(detail_url)
        yield event
def scrape_cal_page(self, url):
    """Scrape one page of the events calendar, recursing into the
    "Upcoming Events" page when one is linked.

    Yields Event objects.
    """
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for row in page.xpath("//article[contains(@class,'accordion')]"):
        when = row.xpath(".//time/@datetime")[0]
        when = dateutil.parser.parse(when)

        title = row.xpath(
            ".//h3[contains(@class,'heading-link')]/text()"
        )[0].strip()

        description = row.xpath(
            "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
        )[0].text_content()
        # fix special chars
        description = (
            description.replace("\n\u2013", " ")
            .replace("\n", " ")
            .replace("\u203a", "")
        )
        description = description.replace("More about this event", "").strip()

        location = row.xpath(
            "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
        )[0].text_content()

        event = Event(
            name=title,
            description=description,
            start_date=when,
            location_name=location,
        )

        agenda_url = row.xpath(
            ".//a[contains(text(),'More about this event')]/@href"
        )
        if agenda_url != []:
            event.add_document(
                "Details and Agenda", agenda_url[0], media_type="text/html"
            )

        if "committee meeting" in title.lower():
            com_name = title.replace("Committee Meeting", "").strip()
            # BUG FIX: the participant type was misspelled "commitee",
            # which prevented the host committee from being categorized
            # correctly downstream.
            event.add_participant(com_name, type="committee", note="host")

        event.add_source(url)
        yield event

    if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
        next_url = page.xpath(
            "//a[contains(text(), 'Upcoming Events')]/@href"
        )[0]
        yield from self.scrape_cal_page(next_url)
def scrape_upper_com(self, url, com, session):
    """Scrape one Senate committee's meetings table.

    Fixes two defects in the original:
    - the agenda-document xpaths used absolute ``//a[...]`` selectors,
      which search the whole document instead of the current row, so every
      row picked up the first matching links on the page;
    - the loop reused ``url`` for document/media hrefs, so the final
      ``event.add_source(url)`` recorded the last attachment link instead
      of the committee page URL.
    """
    url = f"{url}{session}"
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    com = f"Senate {com}"

    for row in page.xpath('//table[@id="meetingsTbl"]/tbody/tr'):
        day = row.xpath("td[1]")[0].text_content().strip()
        time = row.xpath("td[2]")[0].text_content().strip()
        notice = row.xpath("td[3]")[0].text_content().strip()
        location = "See Agenda"  # it's in the PDFs but not the web page

        date = dateutil.parser.parse(f"{day} {time}")
        date = self.tz.localize(date)

        if notice.lower() == "not meeting" or "cancelled" in notice.lower():
            continue

        event = Event(name=com, start_date=date, location_name=location)

        agenda_classes = [
            "mtgrecord_notice",
            "mtgrecord_expandedAgenda",
            "mtgrecord_attendance",
        ]
        for agenda_class in agenda_classes:
            # relative xpath so we only pick up this row's links
            links = row.xpath(f".//a[@class='{agenda_class}']")
            if links:
                doc_url = links[0].xpath("@href")[0]
                doc_name = links[0].text_content().strip()
                event.add_document(
                    doc_name, doc_url, media_type="application/pdf"
                )

        for link in row.xpath("td[7]/a"):
            media_url = link.xpath("@href")[0]
            doc_name = link.text_content().strip()
            event.add_media_link(doc_name, media_url, "audio/mpeg")

        for link in row.xpath("td[9]/a"):
            media_url = link.xpath("@href")[0]
            doc_name = link.text_content().strip()
            event.add_media_link(doc_name, media_url, "text/html")

        # record the committee meetings page itself as the source
        event.add_source(url)
        yield event
def parse_div(self, row, chamber, com):
    """Parse one hearing listing div into an Event.

    Pulls title/location/times from the linked Google Calendar entry,
    then attaches agenda items, documents, bills, and video links found
    in the row. Yields a single Event.
    """
    gcal_href = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    title, location, start_date, end_date = self.parse_gcal(gcal_href)

    event = Event(
        start_date=start_date,
        end_date=end_date,
        name=title,
        location_name=location,
    )
    event.add_source("http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx")

    # plain agenda items (no attachments)
    for node in row.xpath('.//div[@class="col-xs-12a Item"]'):
        event.add_agenda_item(description=node.xpath("string(.)").strip())

    # agenda items that carry a linked PDF document
    for node in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        text = node.xpath("string(.)").strip()
        event.add_agenda_item(description=text)
        event.add_document(
            text,
            node.xpath("@href")[0],
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    # agenda items tied to a specific bill
    for node in row.xpath(
        './/div[contains(@class,"ItemContainer")]'
        '[./div[@class="col-xs-1 Item"]]'
    ):
        agenda = event.add_agenda_item(
            description=node.xpath("string(.)").strip()
        )
        bill = node.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        agenda.add_bill(bill)

    video = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video:
        event.add_media_link(
            "Video of Hearing", video[0].xpath("@href")[0], "text/html"
        )

    # subcommittees host their own hearings; otherwise the parent committee
    if "subcommittee" in title.lower():
        host = title.split("-")[0].strip()
    else:
        host = com
    event.add_participant(host, type="committee", note="host")

    yield event
def scrape_event(self, row):
    """Build a single Event from one table row of the events listing."""
    date_cell = row.xpath("td[1]")[0]
    info_cell = row.xpath("td[2]")[0]

    date_text = date_cell.xpath("b")[0].text.strip()
    time_text = date_cell.xpath("b/following-sibling::text()")[0].strip()
    start_date = datetime.datetime.strptime(
        f"{date_text} {time_text}", "%m/%d/%y %I:%M %p"
    )

    title = info_cell.xpath("font[1]/strong")[0].text.strip()

    # collect all non-blank text lines from the info cell
    lines = [
        text.strip()
        for text in info_cell.xpath("descendant-or-self::*/text()")
        if text.strip()
    ]
    if len(lines) > 1:
        # first line repeats the title; second is the address;
        # everything after that becomes the description
        address = lines[1]
        notes = "\n".join(lines[2:])
    else:
        address = "TBD"
        notes = lines[0]

    event = Event(
        start_date=self._TZ.localize(start_date),
        name=title,
        location_name=address,
        description=notes,
    )
    event.add_source(self.URL)

    if info_cell.xpath('a[contains(font/text(),"agenda")]'):
        event.add_document("Agenda", url=info_cell.xpath("a/@href")[0])

    yield event
def scrape_web_json(self, url):
    """Yield events from the web calendar JSON feed at *url*."""
    for item in self.get(url).json():
        starts = self._tz.localize(dateutil.parser.parse(item["start"]))
        ends = self._tz.localize(dateutil.parser.parse(item["end"]))

        # prefer the long title when the feed provides a non-empty one
        if "longtitle" in item and item["longtitle"] != "":
            title = item["longtitle"]
        else:
            title = item["title"]

        location = item["body"]
        # chamber codes mean the event is at the capitol complex
        if location in ["H", "S", "I"]:
            location = "1700 W. Washington St., Phoenix, Arizona, 85007"
        if not location:
            location = "See Agenda"

        event = Event(
            name=title,
            location_name=location,
            start_date=starts,
            end_date=ends,
            description="",
        )

        if "PDFFile" in item:
            event.add_document(
                "Agenda",
                f"https://www.azleg.gov{item['PDFFile']}",
                media_type="application/pdf",
            )
        event.add_source("https://www.azleg.gov/Alis-Today/")
        yield event
def scrape_lower(self):
    """Scrape House committee hearings from the schedules page.

    Fixes two defects:
    - ``event.add_bill(desc)`` attached the whole description string for
      every extracted bill id instead of the id itself;
    - ``os.path.basename(parsed_url)`` was called on the ParseResult
      object; it needs the ``.path`` component.
    """
    url = "https://www.house.leg.state.mn.us/Schedules/All"
    page = self.lxmlize(url)

    for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
        # skip floor sessions and unlinked events
        if not row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
        ):
            continue

        # skip joint ones, we'll get those from the senate API
        if row.xpath('div[contains(@class,"card-header bg-joint")]'):
            continue

        # top-level committee
        com = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        )[0].strip()
        com_link = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
        )[0]

        when = (
            row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0]
            .replace("\r\n", "")
            .strip()
        )
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)

        if row.xpath('.//b[.="Location:"]'):
            where = row.xpath(
                './/b[.="Location:"]/following-sibling::text()[1]'
            )[0].strip()
        else:
            where = "See committee page"

        if row.xpath('.//b[.="Agenda:"]'):
            desc = "\n".join(
                row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()')
            ).strip()
        else:
            desc = "See committee page"

        event = Event(
            name=com,
            start_date=when,
            location_name=where,
            classification="committee-meeting",
            description=desc,
        )

        event.add_source(com_link)

        # BUG FIX: was event.add_bill(desc) -- attached the description
        # text instead of each extracted bill id
        for bill in get_bill_ids(desc):
            event.add_bill(bill)

        if row.xpath(
            ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
        ):
            agenda = event.add_agenda_item("Bills")
            for bill_id in row.xpath(
                ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
            ):
                agenda.add_bill(bill_id.strip())

        for attachment in row.xpath(".//ul/li/div/a"):
            doc_url = attachment.xpath("@href")[0]
            doc_name = attachment.xpath("text()")[0].strip()

            # if they don't provide a name just use the filename
            if doc_name == "":
                # BUG FIX: basename needs the path component, not the
                # whole ParseResult object
                doc_name = os.path.basename(urlparse(doc_url).path)

            # sometimes broken links to .msg files (emails?) are attached,
            # they always 404.
            if doc_url.endswith(".msg"):
                continue

            media_type = get_media_type(doc_url)
            event.add_document(
                doc_name, doc_url, media_type=media_type, on_duplicate="ignore"
            )

        for committee in row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        ):
            event.add_participant(committee, type="committee", note="host")

        yield event
def scrape_upper(self):
    """Scrape Senate hearings from the upcoming-schedule API.

    BUG FIX: ``event.add_bill(description)`` attached the whole hearing
    description for every extracted bill id instead of the id itself.
    """
    url = "https://www.senate.mn/api/schedule/upcoming"
    data = self.get(url).json()

    for row in data["events"]:
        com = row["committee"]["committee_name"]
        start = dateutil.parser.parse(row["hearing_start"])
        start = self._tz.localize(start)

        if (
            row["hearing_room"]
            and "hearing_building" in row
            and row["hearing_building"]
        ):
            where = f"{row['hearing_building']} {row['hearing_room']}"
        elif "hearing_building" in row and row["hearing_building"]:
            where = row["hearing_building"]
        else:
            where = "TBD"

        description = ""
        if "hearing_notes" in row and row["hearing_notes"]:
            description = row["hearing_notes"]

        event = Event(
            name=com,
            location_name=where,
            start_date=start,
            classification="committee-meeting",
            description=description,
        )

        # BUG FIX: was event.add_bill(description)
        for bill in get_bill_ids(description):
            event.add_bill(bill)

        # pick the best available source link, in order of preference
        if "lrl_schedule_link" in row:
            event.add_source(row["lrl_schedule_link"])
        else:
            if "link" in row["committee"]:
                if row["committee"]["link"].startswith("http"):
                    event.add_source(row["committee"]["link"])
                elif row["committee"]["link"].startswith("www"):
                    event.add_source(f"http://{row['committee']['link']}")
                else:
                    event.add_source(
                        f"https://www.senate.mn/{row['committee']['link']}"
                    )
            elif "senate_chair_link" in row["committee"]:
                event.add_source(
                    f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                )

        if "agenda" in row:
            for agenda_row in row["agenda"]:
                if (
                    agenda_row["description"] is None
                    or agenda_row["description"].strip() == ""
                ):
                    # sometimes they have blank agendas but bills or files
                    agenda_row["description"] = "Agenda"
                agenda = event.add_agenda_item(agenda_row["description"])

                if "bill_type" in agenda_row:
                    agenda.add_bill(
                        "{} {}".format(
                            agenda_row["bill_type"].replace(".", ""),
                            agenda_row["bill_number"],
                        )
                    )

                if "files" in agenda_row:
                    for file_row in agenda_row["files"]:
                        doc_name = file_row["filename"]
                        doc_url = file_row["file_path"]

                        # if they don't provide a name just use the filename
                        if doc_name == "":
                            doc_name = os.path.basename(urlparse(doc_url).path)

                        event.add_document(
                            doc_name,
                            f"https://www.senate.mn/{doc_url}",
                            media_type="text/html",
                            on_duplicate="ignore",
                        )

        if "video_link" in row:
            event.add_media_link("Video", row["video_link"], "text/html")

        if "audio_link" in row:
            event.add_media_link("Audio", row["audio_link"], "text/html")

        yield event
def house_meeting(self, xml, source_url):
    """Build an Event from a US House committee meeting XML document.

    :param xml: parsed lxml tree of the meeting XML
    :param source_url: URL the XML was fetched from (recorded as source)
    Yields a single Event.
    """
    title = xml.xpath("string(//meeting-details/meeting-title)")
    meeting_date = xml.xpath("string(//meeting-date/calendar-date)")
    start_time = xml.xpath("string(//meeting-date/start-time)")
    end_time = xml.xpath("string(//meeting-date/end-time)")
    start_dt = datetime.datetime.strptime(
        "{} {}".format(meeting_date, start_time), "%Y-%m-%d %H:%M:%S")
    start_dt = self._TZ.localize(start_dt)
    # end time is optional in the feed
    end_dt = None
    if end_time != "":
        end_dt = datetime.datetime.strptime(
            "{} {}".format(meeting_date, end_time), "%Y-%m-%d %H:%M:%S")
        end_dt = self._TZ.localize(end_dt)
    # NOTE(review): end_dt is computed but never passed to the Event below
    # -- possibly an omission (end_date=end_dt); confirm intent.
    building = xml.xpath(
        "string(//meeting-details/meeting-location/capitol-complex/building)"
    )
    # "Select one" is the form's placeholder value when no building was set
    address = "US Capitol"
    if building != "Select one":
        # map a building code to its human-readable name when known
        if self.buildings.get(building):
            building = self.buildings.get(building)
        room = xml.xpath(
            "string(//meeting-details/meeting-location/capitol-complex/room)"
        )
        address = "{}, Room {}".format(building, room)
    event = Event(start_date=start_dt, name=title, location_name=address)
    event.add_source(source_url)
    coms = xml.xpath(
        "//committees/committee-name | //subcommittees/committee-name")
    for com in coms:
        com_name = com.xpath("string(.)")
        com_name = "House {}".format(com_name)
        event.add_participant(
            com_name,
            type="committee",
            note="host",
        )
    docs = xml.xpath("//meeting-documents/meeting-document")
    for doc in docs:
        doc_name = doc.xpath("string(description)")
        doc_files = doc.xpath("files/file")
        for doc_file in doc_files:
            media_type = self.media_types[doc_file.get("doc-type")]
            url = doc_file.get("doc-url")
            # BR/AM/CA documents are bills/amendments: derive a bill id
            # from the document name and attach it as an agenda item
            if doc.get("type") in ["BR", "AM", "CA"]:
                if doc_name == "":
                    doc_name = doc.xpath("string(legis-num)").strip()
                matches = re.findall(r"([\w|\.]+)\s+(\d+)", doc_name)
                if matches:
                    match = matches[0]
                    bill_type = match[0].replace(".", "")
                    bill_number = match[1]
                    bill_name = "{} {}".format(bill_type, bill_number)
                    agenda = event.add_agenda_item(description=bill_name)
                    agenda.add_bill(bill_name)
            # fall back to a name derived from the document type code
            if doc_name == "":
                try:
                    doc_name = self.hearing_document_types[doc.get("type")]
                except KeyError:
                    self.warning("Unable to find document type: {}".format(
                        doc.get("type")))
            event.add_document(doc_name,
                               url,
                               media_type=media_type,
                               on_duplicate="ignore")
    yield event
def scrape(self):
    """Scrape hearing notices from the hearings listing page.

    De-duplicates rows that share a notice URL (the listing shows the
    same hearing in multiple rows); get_related_bills() supplies bills
    and descriptions. Yields Event objects.

    :raises EmptyScrape: when the page reports "No Hearings"
    """
    get_short_codes(self)
    page = self.lxmlize(URL)

    if page.xpath("//td[contains(string(.),'No Hearings')]"):
        raise EmptyScrape

    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']"
    )[0]

    for event_row in table.xpath(".//tr")[1:]:
        tds = event_row.xpath("./td")
        committee = tds[0].text_content().strip()

        # Multi-committee events will be CODE1/CODE2/CODE3
        if "/" in committee:
            coms = committee.split("/")
            com_names = []
            for com in coms:
                com_names.append(
                    "{} {}".format(
                        self.chambers[self.short_ids[com]["chamber"]],
                        self.short_ids[com]["name"],
                    )
                )
            descr = ", ".join(com_names)
        elif self.short_ids.get(committee):
            descr = "{} {}".format(
                self.chambers[self.short_ids[committee]["chamber"]],
                self.short_ids[committee]["name"],
            )
        else:
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                # BUG FIX: was a bare `raise Exception` with no context,
                # which made failures impossible to diagnose from logs
                raise Exception(
                    "Unexpected event description cells: {}".format(descr)
                )
            descr = descr[0].replace(".", "").strip()

        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib["href"]
        notice_name = notice.text

        # the listing page shows the same hearing in multiple rows.
        # combine these -- get_related_bills() will take care of adding
        # the bills and descriptions
        if notice_href in self.seen_hearings:
            continue
        else:
            self.seen_hearings.append(notice_href)

        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = TIMEZONE.localize(when)

        event = Event(
            name=descr,
            start_date=when,
            classification="committee-meeting",
            description=descr,
            location_name=where,
        )

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            # expand known committee codes to "<Chamber> <Name>"
            if "INFO" not in committee and committee in self.short_ids:
                committee = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            event.add_committee(committee, note="host")

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type="text/html")

        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill["descr"].strip())
            bill["bill_id"] = bill["bill_id"].split(",")[0]
            a.add_bill(bill["bill_id"], note=bill["type"])

        yield event
def scrape_agenda(self, url):
    """Scrape a single agenda page into an Event.

    Returns early (no event) when the page lacks a time/place table or
    the meeting is cancelled; otherwise yields one Event with the bills
    listed on the agenda attached.
    """
    page = self.lxmlize(url)
    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if date_time == []:
        return
    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    # build a label -> value map from the DATE:/TIME:/PLACE: rows
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf["DATE:"]
    time = metainf["TIME:"]
    where = metainf["PLACE:"]
    # check for duration in time; keep only the start and its AM/PM marker
    if " - " in time:
        start, end = time.split(" - ")
        am_pm_srch = re.search("(?i)(am|pm)", end)
        if am_pm_srch:
            time = " ".join([start, am_pm_srch.group().upper()])
        else:
            time = start
    fmts = [
        "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M"
    ]
    event_desc = "Meeting Notice"
    # "Rise" means the meeting starts when the chamber rises -- no clock
    # time is available, so parse the date alone
    if "Rise" in time:
        datetime = date
        event_desc = "Meeting Notice: Starting at {}".format(time)
    else:
        datetime = "%s %s" % (date, time)
    if "CANCELLED" in datetime.upper():
        return
    # normalize AM/PM spellings and strip status words before parsing
    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        datetime = datetime.replace(trans, transtable[trans])
    datetime = datetime.strip()
    # try each format until one parses
    # NOTE(review): local name `datetime` shadows the usual module name
    # (module is used as `dt` here) and stays a str if no format matches,
    # which would make localize() below fail -- confirm inputs always parse
    for fmt in fmts:
        try:
            datetime = dt.datetime.strptime(datetime, fmt)
            break
        except ValueError:
            continue
    event = Event(name=event_desc,
                  start_date=self._tz.localize(datetime),
                  location_name=where)
    event.add_source(url)
    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib["href"]
        event.add_document(bill.text_content(),
                           bill_ft,
                           media_type="application/pdf")
        # the bill id is the concatenated text of the link's grandparent row
        root = bill.xpath("../../*")
        root = [x.text_content() for x in root]
        bill_id = "".join(root)
        if "SCHEDULED FOR" in bill_id:
            continue
        descr = (bill.getparent().getparent().getparent().getnext().
                 getnext().text_content())
        # `replace` is a mapping of substitutions defined elsewhere in the
        # module -- presumably cleans bill-id text; confirm
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])
        item = event.add_agenda_item(descr)
        item.add_bill(bill.text_content())
    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    event.add_participant(committee, "committee", note="host")
    yield event
def scrape(self, start=None, end=None):
    """Scrape committee meetings from the Arkansas calendar.

    BUG FIX: the ``start``/``end`` arguments were previously ignored --
    ``start_date``/``end_date`` were only assigned when the arguments
    were None, so supplying either caused a NameError. Provided values
    are now parsed and used.

    :param start: optional start date string; defaults to now
    :param end: optional end date string; defaults to start of window + 90 days
    """
    if start is None:
        start_date = datetime.datetime.now()
    else:
        start_date = dateutil.parser.parse(start)
    start_date = start_date.strftime(self.date_format)

    # default to 90 days if no end
    if end is None:
        end_date = datetime.datetime.now() + datetime.timedelta(days=90)
    else:
        end_date = dateutil.parser.parse(end)
    end_date = end_date.strftime(self.date_format)

    url = (
        "https://www.arkleg.state.ar.us/Calendars/Meetings?tbType="
        f"&meetingStartDate={start_date}&meetingEndDate={end_date}"
    )
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for row in page.xpath(
        "//div[@id='meetingBodyWrapper']/div[contains(@class,'row')]"
    ):
        row_class = row.xpath("@class")[0]
        # section headers carry the date for the rows that follow
        if "tableSectionHeader" in row_class:
            day = row.xpath("div/text()")[0].strip()
            continue

        time = row.xpath(
            "div[contains(@class,'timeRow')]/b/text()"
        )[0].strip()
        if "no meeting" in time.lower() or "cancelled" in time.lower():
            continue

        if "upon adjournment" in time.lower():
            time = "1:00 PM"

        title = row.xpath("div[2]/b")[0].text_content().strip()

        if "call of the chair" in time.lower():
            time = ""
        else:
            times = re.findall(r"\d+:\d+\s*[A|P]M", time)
            time = times[0]

        when = dateutil.parser.parse(f"{day} {time}")
        when = self._tz.localize(when)

        location = row.xpath("div[2]/text()")[1].strip()

        event = Event(
            name=title,
            start_date=when,
            location_name=location,
            description="",
        )
        event.add_source("https://www.arkleg.state.ar.us/Calendars/Meetings")

        if row.xpath(".//a[@aria-label='Agenda']"):
            agenda_url = row.xpath(".//a[@aria-label='Agenda']/@href")[0]
            event.add_document(
                "Agenda", agenda_url, media_type="application/pdf"
            )

        if row.xpath(".//a[@aria-label='Play Video']"):
            video_url = row.xpath(".//a[@aria-label='Play Video']/@href")[0]
            event.add_media_link(
                "Video of Hearing", video_url, media_type="text/html"
            )

        if row.xpath(".//a[@aria-label='Referred']"):
            bill_url = row.xpath(".//a[@aria-label='Referred']/@href")[0]
            self.scrape_referred_bills(event, bill_url)

        yield event
def scrape_chamber(self, chamber):
    """Scrape one chamber's committee agendas from the legislature API.

    BUG FIX: ``time.replace(r"\\s+", " ")`` treated the regex as a literal
    string and so was a no-op; ``re.sub`` is now used to collapse
    whitespace runs in the parsed time.
    """
    session = self.latest_session()
    session_id = session_metadata.session_id_meta_data[session]

    chamber_abbr = self.chamber_codes[chamber]

    com_url = (
        "https://apps.azleg.gov/api/Committee/?includeOnlyCommitteesWithAgendas=true"
        "&legislativeBody={}&sessionId={}&standingOnly=true&interimOnly=false&jointCommitteesOnly=false"
    )
    com_url = com_url.format(chamber_abbr, session_id)

    coms = self.get(com_url).json()

    for com in coms:
        # joint committees get returned by both endpoints, so skip one
        if com["LegislativeBody"] != chamber_abbr:
            continue

        # https://apps.azleg.gov/api/Agenda/?showPassed=true&sessionId=123
        # &isInterimAgenda=false&body=S&includeItems=false&committeeId=1960
        events_url = (
            "https://apps.azleg.gov/api/Agenda/?includeItems=true&showPassed=true"
            "&sessionId={}&isInterimAgenda=false&body={}&committeeId={}"
        )
        events_url = events_url.format(
            session_id, chamber_abbr, com["CommitteeId"]
        )
        events_list = self.get(events_url).json()

        for row in events_list:
            if (
                row["AgendaCanceled"] is True
                or "not meeting" in row["Time"].lower()
            ):
                continue

            title = "{} {}".format(
                self.code_chambers[chamber_abbr], row["CommitteeName"]
            )

            # fix for dateutil parser confusion
            row["Time"] = row["Time"].replace("A.M.", "AM").replace("P.M.", "PM")

            if "upon rec" not in row["Time"].lower():
                time = re.findall(r"(\d+:\d+\s+[A|P]M)", row["Time"])
                if len(time) == 0:
                    self.warning(
                        f"Unable to get time for {row['Time']} on {title}"
                    )
                    time = "00:00:00"
                else:
                    time = time[0]
                # BUG FIX: str.replace is literal; use re.sub to collapse
                # whitespace runs so dateutil can parse the time
                time = re.sub(r"\s+", " ", time)
            else:
                time = ""

            when = dateutil.parser.parse(f"{row['Date']} {time}")
            when = self._tz.localize(when)

            where = "{}, Room {}".format(self.address, row["Room"])

            event = Event(
                name=title,
                location_name=where,
                start_date=when,
                description="",
            )
            event.add_document(
                "Agenda", row["HttpPath"], media_type="text/html"
            )
            event.add_document(
                "Agenda", row["HttpPdfPath"], media_type="application/pdf"
            )
            event.add_participant(
                row["CommitteeName"], type="committee", note="host"
            )

            for item in row["Items"]:
                agenda_item = event.add_agenda_item(item["Description"])
                # the bill id is the leading token of the description
                bill_id = re.findall(r"^(.*?)\s", item["Description"])[0]
                agenda_item.add_bill(bill_id)

                for speaker in item["RequestsToSpeak"]:
                    speaker_title = speaker["Name"]
                    if speaker["Representing"] != "Self":
                        speaker_title = (
                            f"{speaker['Name']} ({speaker['Representing']})"
                        )
                    event.add_participant(
                        speaker_title, type="person", note="speaker"
                    )

            event.add_source("https://apps.azleg.gov/BillStatus/AgendaSearch")
            yield event
def scrape(self, chamber=None, session=None):
    """
    Scrape the events data from all dates from the sc meetings page,
    then create and yield the events objects from the data.

    :param chamber: "upper", "lower", "other", or None (joint schedule)
    :param session: unused here; kept for interface compatibility
    :return: yielded Event objects
    """
    chambers = {
        "upper": {"name": "Senate", "title": "Senator"},
        "lower": {"name": "House", "title": "Representative"},
    }
    if chamber == "other":
        return

    if chamber is None:
        self.info("no chamber specified, using Joint Committee Meeting Schedule")
        events_url = "http://www.scstatehouse.gov/meetings.php"
    else:
        events_url = "http://www.scstatehouse.gov/meetings.php?chamber=%s" % (
            chambers[chamber]["name"].upper()[0]
        )
    page = self.get_page_from_url(events_url)

    # the page header carries the year the listed week belongs to
    meeting_year = page.xpath('//h2[@class="barheader"]/span')[0].text_content()
    meeting_year = re.search(
        r"Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})", meeting_year
    ).group(1)

    dates = page.xpath("//div[@id='contentsection']/ul")

    for date in dates:
        date_string = date.xpath("span")

        if len(date_string) == 1:
            date_string = date_string[0].text_content()
        else:
            continue

        # If a event is in the next calendar year, the date_string
        # will have a year in it
        if date_string.count(",") == 2:
            event_year = date_string[-4:]
            date_string = date_string[:-6]
        elif date_string.count(",") == 1:
            event_year = meeting_year
        else:
            # BUG FIX: the original applied .format() to the raise
            # expression -- AssertionError("...'{}'").format(...) -- which
            # raised AttributeError instead of the intended message.
            raise AssertionError(
                "This is not a valid date: '{}'".format(date_string)
            )

        for meeting in date.xpath("li"):
            time_string = meeting.xpath("span")[0].text_content()

            # skip cancelled meetings entirely
            if (
                time_string == "CANCELED"
                or len(meeting.xpath('.//span[contains(text(), "CANCELED")]')) > 0
            ):
                continue

            time_string = normalize_time(time_string)
            date_time = datetime.datetime.strptime(
                event_year + " " + date_string + " " + time_string,
                "%Y %A, %B %d %I:%M %p",
            )
            date_time = self._tz.localize(date_time)

            # meeting text is "-- location -- description"
            meeting_info = meeting.xpath("br[1]/preceding-sibling::node()")[1]
            location, description = re.search(
                r"-- (.*?) -- (.*)", meeting_info
            ).groups()

            # if re.search(r'committee', description, re.I):
            #     meeting_type = 'committee:meeting'
            # else:
            #     meeting_type = 'other:meeting'

            event = Event(
                name=description,  # Event Name
                start_date=date_time,  # When the event will take place
                location_name=location,  # Where the event will be
            )

            event.add_source(events_url)

            agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")

            if agenda_url:
                agenda_url = agenda_url[0].attrib["href"]
                event.add_source(agenda_url)
                event.add_document(
                    note="Agenda", url=agenda_url, media_type="application/pdf"
                )

                agenda_page = self.get_page_from_url(agenda_url)

                for bill in agenda_page.xpath(
                    ".//a[contains(@href,'billsearch.php')]"
                ):
                    # bill_url = bill.attrib['href']
                    bill_id = bill.text_content().replace(".", "").replace(" ", "")
                    # bill_description = self.get_bill_description(bill_url)

                    event.add_bill(bill_id)

            yield event
def scrape_committee_page(self, url):
    """Scrape one Indiana committee's agenda page and yield its meetings as Events."""
    page = self.get(url, headers=self.cf_headers).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # committee name from the page heading
    com = page.xpath('//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip()

    for row in page.xpath('//div[contains(@id, "agenda-item")]'):
        # status = "tentative"
        meta = row.xpath('div[contains(@class,"accordion-heading-agenda")]/a')[0]

        date = meta.xpath("text()")[0].strip()

        # heading span holds "<time>\n<location>"
        time_and_loc = meta.xpath("span/text()")[0].strip()
        time_and_loc = time_and_loc.split("\n")
        time = time_and_loc[0]
        loc = time_and_loc[1]

        if loc == "":
            loc = "See Agenda"

        com = com.replace("(S)", "Senate").replace("(H)", "House")

        # Indiana has a LOT of undefined times, stuff like "15 mins after adj. of elections"
        # so just remove the time component if it won't parse, and the user can go to the agenda
        try:
            when = dateutil.parser.parse(f"{date} {time}")
        except dateutil.parser._parser.ParserError:
            when = dateutil.parser.parse(date)

        when = self._tz.localize(when)

        # cancellations are flagged in the time string; skip those rows
        if "cancelled" in time.lower():
            continue

        event = Event(
            name=com,
            start_date=when,
            location_name=loc,
            classification="committee-meeting",
        )

        event.add_source(url)
        event.add_participant(com, type="committee", note="host")

        if row.xpath('.//a[contains(text(), "View Agenda")]'):
            agenda_url = row.xpath('.//a[contains(text(), "View Agenda")]/@href')[0]
            event.add_document("Agenda", agenda_url, media_type="application/pdf")

        if row.xpath('.//a[contains(text(), "Watch")]'):
            vid_url = row.xpath('.//a[contains(text(), "Watch")]/@href')[0]
            event.add_media_link("Video of Hearing", vid_url, media_type="text/html")

        # bill rows, when present, become one shared agenda item
        if row.xpath('.//tr[contains(@class,"bill-container")]/td'):
            agenda = event.add_agenda_item("Bills under consideration")
            for bill_row in row.xpath('.//tr[contains(@class,"bill-container")]'):
                bill_id = bill_row.xpath(
                    ".//a[contains(@class,'bill-name-link')]/text()"
                )[0]
                agenda.add_bill(bill_id)

        yield event
def scrape(self, chamber=None):
    """Scrape Utah committee meetings for every month of the current year.

    :param chamber: unused; kept for interface compatibility
    """
    url = "https://le.utah.gov/CalServ/CalServ?month={}&year={}"

    year = datetime.datetime.today().year

    # API months appear to be 0-indexed (note the +1 when building the date)
    for i in range(0, 12):
        page = self.get(url.format(i, year)).json()
        if "days" in page:
            for day_row in page["days"]:
                for row in day_row["events"]:
                    # ignore 'note', 'housefloor', 'senatefloor'
                    if row["type"] == "meeting":
                        status = "tentative"
                        title = row["desc"]
                        where = row["location"]

                        when = dateutil.parser.parse(
                            f"{day_row['year']}-{str(int(day_row['month'])+1)}-{day_row['day']} {row['time']}"
                        )
                        when = self._tz.localize(when)

                        # "C" marks a cancelled meeting
                        if "status" in row and row["status"] == "C":
                            status = "cancelled"

                        event = Event(
                            name=title,
                            location_name=where,
                            start_date=when,
                            classification="committee-meeting",
                            status=status,
                        )

                        if "agenda" in row:
                            event.add_document(
                                "Agenda",
                                f"{self.base_url}{row['agenda']}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )

                        if "minutes" in row:
                            event.add_document(
                                "Minutes",
                                f"{self.base_url}{row['minutes']}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )

                        if "mediaurl" in row:
                            event.add_media_link(
                                "Media",
                                f"{self.base_url}{row['mediaurl']}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )
                            # the media URL embeds a meeting id we can use
                            # to fetch meeting materials from a second API
                            if re.findall(r"mtgID=(\d+)", row["mediaurl"]):
                                hearing_id = re.findall(
                                    r"mtgID=(\d+)", row["mediaurl"]
                                )[0]
                                docs_url = f"https://glen.le.utah.gov/committees/meeting/{hearing_id}/1234"
                                docs_page = self.get(docs_url).json()
                                if "meetingMaterials" in docs_page:
                                    for mat in docs_page["meetingMaterials"]:
                                        agenda = event.add_agenda_item(
                                            mat["description"]
                                        )
                                        event.add_document(
                                            mat["description"],
                                            f"{self.base_url}{mat['docUrl']}",
                                            media_type="application/pdf",
                                            on_duplicate="ignore",
                                        )
                                        # bill ids like "HB0123" embedded in the description
                                        for bill_row in re.findall(
                                            r"(\w{2,3}\d{4})", mat["description"]
                                        ):
                                            agenda.add_bill(bill_row)

                                # NOTE: The following data appears to be duped on the meetingMaterials endpoint
                                # but leaving this in place commented out, in case that ever changes.
                                #
                                # # rather than return an empty object this page just times out if there are no bills
                                # # so don't retry, and pass on failure
                                # bills_url = f"https://glen.le.utah.gov/agencal/{hearing_id}/1234"
                                # self.retry_attempts = 0
                                # try:
                                #     bills_page = self.get(bills_url, timeout=3).json()
                                #     if 'agendaitems' in bills_page:
                                #         for bill_row in bills_page['agendaitems']:
                                #             agenda = event.add_agenda_item(bill_row['description'])
                                #             if 'bill' in bill_row:
                                #                 agenda.add_bill(bill_row['bill'])
                                #                 print(bill_row)
                                # except requests.exceptions.ReadTimeout:
                                #     pass

                                # then reset the retry attempts to normal for other requests
                                self.retry_attempts = 3

                        source_url = f"{self.base_url}{row['itemurl']}"
                        event.add_source(source_url)

                        yield event
def scrape(self, session=None):
    """Scrape Montana committee hearings from this month through +2 months.

    Rows for the same committee at the same date/time are merged into a
    single Event (tracked in self.events) with one agenda item per bill.

    :param session: session identifier; defaults to the latest session
    """
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using latest")

    for i in self.jurisdiction.legislative_sessions:
        if i["identifier"] == session:
            session_slug = i["_scraped_name"]

    url = (
        "http://laws.leg.mt.gov/legprd/LAW0240W$CMTE.ActionQuery?P_SESS={session_slug}"
        "&P_COM_NM=&P_ACTN_DTM={start}&U_ACTN_DTM={end}&Z_ACTION2=Find"
    )

    start = datetime.datetime.today()
    # this month and the next 2 months
    end = start + relativedelta.relativedelta(months=+2)

    url = url.format(
        session_slug=session_slug,
        start=start.strftime("%m/01/%Y"),
        end=end.strftime("%m/%d/%Y"),
    )

    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for row in page.xpath("//table[@border]/tr"):
        # skip table headers
        if not row.xpath("td[1]/a"):
            continue
        day = row.xpath("td[2]/text()")[0].strip()
        time = row.xpath("td[3]/text()")[0].strip()
        room = row.xpath("td[4]")[0].text_content().strip()
        bill = row.xpath("td[5]/a[1]/text()")[0].strip()
        bill_title = row.xpath("td[6]/text()")[0].strip()

        com = row.xpath("td[1]/a[1]/text()")[0].strip()
        com = com.replace("(H)", "House").replace("(S)", "Senate")

        when = parser.parse(f"{day} {time}")
        when = self._tz.localize(when)

        # BUG FIX: the dedup key used "%Y%m%d%H%I" -- %I is the 12-hour
        # hour, not minutes -- so distinct meetings within the same hour
        # collided into one event; use %M for minutes.
        when_slug = when.strftime("%Y%m%d%H%M")
        if com not in self.events:
            self.events[com] = {}

        if when_slug not in self.events[com]:
            # start a new event for this committee/date-time
            event = Event(
                name=com,
                location_name=room,
                start_date=when,
                classification="committee-meeting",
            )
            event.add_source(row.xpath("td[1]/a[1]/@href")[0])
        else:
            # merge this bill into the existing event
            event = self.events[com][when_slug]

        agenda = event.add_agenda_item(bill_title)
        agenda.add_bill(bill)

        if row.xpath('.//a[contains(@href,"/billhtml/")]'):
            bill_url = row.xpath('.//a[contains(@href,"/billhtml/")]/@href')[0]
            event.add_document(bill_title, bill_url, media_type="text/html")
        if row.xpath('.//a[contains(@href,"/billpdf/")]'):
            bill_url = row.xpath('.//a[contains(@href,"/billpdf/")]/@href')[0]
            event.add_document(bill_title, bill_url, media_type="application/pdf")

        self.events[com][when_slug] = event

    for com in self.events:
        for date in self.events[com]:
            yield self.events[com][date]
def scrape(self):
    """Scrape the Kentucky legislative calendar and yield committee meeting Events."""
    url = "https://apps.legislature.ky.gov/legislativecalendar"

    page = self.get(url).content
    page = lxml.html.fromstring(page)

    for time_row in page.xpath('//div[contains(@class,"TimeAndLocation")]'):
        # the date lives in the nearest preceding DateHeading div
        date = (
            time_row.xpath(
                'preceding-sibling::div[contains(@class,"DateHeading")][1]'
            )[0]
            .text_content()
            .strip()
        )

        status = "tentative"
        if time_row.xpath('div[contains(@class,"Cancelled")]'):
            status = "cancelled"

        row_text = time_row.text_content()
        row_text = row_text.replace("Noon", "PM")
        # upon recess (of House|Senate)
        row_text = re.sub(r"Upon Recess(\sof\s)?(House|Senate)?", "", row_text)

        # first chunk before a comma/AM/PM is the time; the rest is location
        parts = re.split(r",|AM|PM", row_text)
        time = parts[0].strip()
        # BUG FIX: the original used x.replace(r"\xa0", ""), a literal
        # backslash-x-a-0 string that never occurs in the text; strip the
        # actual non-breaking space character instead.
        location = " ".join(x.replace("\xa0", "").strip() for x in parts[1:])

        when = f"{date} {time}"
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)

        # rows without a linked committee name are not meetings we can use
        if not time_row.xpath(
            'following-sibling::div[contains(@class,"CommitteeName")][1]/a'
        ):
            continue

        com_name = (
            time_row.xpath(
                'following-sibling::div[contains(@class,"CommitteeName")][1]/a'
            )[0]
            .text_content()
            .strip()
        )

        event = Event(
            name=com_name,
            start_date=when,
            classification="committee-meeting",
            location_name=location,
            status=status,
        )

        if time_row.xpath('following-sibling::div[contains(@class,"Agenda")][1]'):
            agenda_row = time_row.xpath(
                'following-sibling::div[contains(@class,"Agenda")][1]'
            )[0]
            agenda_text = agenda_row.text_content().strip()
            agenda = event.add_agenda_item(agenda_text)
            for bill_link in agenda_row.xpath('.//a[contains(@href,"/record/")]'):
                agenda.add_bill(bill_link.text_content().strip())

        event.add_participant(com_name, note="host", type="committee")

        com_page_link = time_row.xpath(
            'following-sibling::div[contains(@class,"CommitteeName")][1]/a/@href'
        )[0]

        # attach meeting materials / minutes scraped from the committee page
        docs = self.scrape_com_docs(com_page_link)
        lookup_date = when.strftime("%Y-%m-%d")

        if lookup_date in docs["mats"]:
            for mat in docs["mats"][lookup_date]:
                event.add_document(mat["text"], mat["url"], on_duplicate="ignore")

        if lookup_date in docs["minutes"]:
            for mat in docs["minutes"][lookup_date]:
                event.add_document(mat["text"], mat["url"], on_duplicate="ignore")

        event.add_source(url)

        yield event
def scrape(self):
    """Yield committee-meeting Events from the Wyoming LSO calendar API.

    Queries the current month and the following two months.
    """
    api_url = "https://web.wyoleg.gov/LsoService/api/Calendar/Events/{}{}01"
    now = datetime.datetime.today()

    # this month and the next 2 months
    for offset in (0, 1, 2):
        month_start = now + relativedelta.relativedelta(months=+offset)
        yyyy = str(month_start.year)
        mm = str(month_start.month).zfill(2)

        listing = self.get(api_url.format(yyyy, mm)).json()

        for row in listing:
            # only meetingKind == 2 rows are committee meetings
            if row["meetingKind"] != 2:
                continue

            com = f"{row['meetingType']} {row['committee']['fullName']}"
            # skip state holidays or other non-committee hearings
            if com.strip() == "":
                continue

            begins = self._tz.localize(parser.parse(row["startDate"]))
            ends = self._tz.localize(parser.parse(row["endTime"]))

            where = row["address1"]
            if where == "":
                where = "TBD"

            event = Event(
                name=com,
                location_name=where,
                start_date=begins,
                end_date=ends,
                classification="committee-meeting",
                description=row["purpose"],
            )

            for media in row["meetingMedias"]:
                # all these i've seen say they're octet stream but are actually youtube links
                event.add_media_link(
                    media["documentType"],
                    media["filePath"],
                    "text/html",
                    on_duplicate="ignore",
                )

            for doc in row["meetingDocuments"]:
                event.add_document(
                    doc["title"],
                    f"{self.base_url}{doc['documentUrl']}",
                    on_duplicate="ignore",
                )

            for item in row["meetingAgendas"]:
                self.parse_agenda_item(event, item)

            # lazily create a single agenda item shared by all bills
            bills_item = None
            for bill in row["sessionMeetingBills"]:
                if bills_item is None:
                    bills_item = event.add_agenda_item("Bills under Consideration")
                bills_item.add_bill(bill["billNumber"])

            web_url = "https://www.wyoleg.gov/Calendar/{year}{month}01/Meeting?type=committee&id={meeting_id}"
            event.add_source(
                web_url.format(year=yyyy, month=mm, meeting_id=row["id"])
            )

            yield event
def scrape(self, session=None, start=None, end=None):
    """Scrape Maine public hearings / work sessions in a date window.

    :param session: legislature number (defaults to latest)
    :param start: "YYYY-MM-DD" window start (defaults to now)
    :param end: "YYYY-MM-DD" window end (defaults to start + 30 days)
    """
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # testimony url, we'll need it later in a loop
    # testmony query looks gnary but breaks down to:
    # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
    # $orderby: LastName,FirstName,Organization
    # $expand: Request
    # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
    # PresentedDate,FileSize,Topic
    testimony_url_base = (
        "http://legislature.maine.gov/backend/"
        "breeze/data/CommitteeTestimony?"
        "$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and"
        "%20(Request%2FLegislature%20eq%20{})"
        "&$orderby=LastName%2CFirstName%2COrganization&"
        "$expand=Request&$select=Id%2CFileType%2CNamePrefix"
        "%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic"
    )

    if start is None:
        start_date = datetime.datetime.now().isoformat()
    else:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
        start_date = start_date.isoformat()

    # default to 30 days if no end
    if end is None:
        dtdelta = datetime.timedelta(days=30)
        end_date = datetime.datetime.now() + dtdelta
        end_date = end_date.isoformat()
    else:
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
        end_date = end_date.isoformat()

    # pre-fetch the bills scheduled per calendar event, keyed by EventId
    bills_by_event = {}

    bills_url = (
        "http://legislature.maine.gov/backend/breeze/data/"
        "getCalendarEventsBills?startDate={}&endDate={}"
    )
    bills_url = bills_url.format(start_date, end_date)
    page = json.loads(self.get(bills_url).content)

    for row in page:
        bills_by_event.setdefault(row["EventId"], [])
        bills_by_event[row["EventId"]].append(row)

    # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
    url = (
        "http://legislature.maine.gov/backend/breeze/data/"
        "getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true"
    )
    url = url.format(start_date, end_date)

    page = json.loads(self.get(url).content)

    for row in page:
        if row["Cancelled"] is True or row["Postponed"] is True:
            continue

        start_date = self._TZ.localize(dateutil.parser.parse(row["FromDateTime"]))
        end_date = self._TZ.localize(dateutil.parser.parse(row["ToDateTime"]))

        name = row["CommitteeName"]

        if name is None:
            name = row["Host"]

        # expand the site's shorthand locations to full street addresses
        address = row["Location"]
        address = address.replace(
            "Cross Building",
            "Cross Office Building, 111 Sewall St, Augusta, ME 04330",
        )

        address = address.replace(
            "State House", "Maine State House, 210 State St, Augusta, ME 04330"
        )

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=name,
            location_name=address,
        )

        event.add_source(
            "http://legislature.maine.gov/committee/#Committees/{}".format(
                row["CommitteeCode"]
            )
        )

        if bills_by_event.get(row["Id"]):
            for bill in bills_by_event[row["Id"]]:
                description = "LD {}: {}".format(bill["LD"], bill["Title"])
                agenda = event.add_agenda_item(description=description)
                agenda.add_bill("LD {}".format(bill["LD"]))

                # fetch and attach any submitted testimony for this paper
                if bill["TestimonyCount"] > 0:
                    test_url = testimony_url_base.format(bill["PaperNumber"], session)
                    test_page = json.loads(self.get(test_url).content)

                    for test in test_page:
                        title = "{} {} - {}".format(
                            test["FirstName"],
                            test["LastName"],
                            test["Organization"],
                        )
                        if test["NamePrefix"] is not None:
                            title = "{} {}".format(test["NamePrefix"], title)

                        test_url = (
                            "http://legislature.maine.gov/backend/app/services"
                            "/getDocument.aspx?doctype=test&documentId={}".format(
                                test["Id"]
                            )
                        )

                        if test["FileType"] == "pdf":
                            media_type = "application/pdf"

                        event.add_document(
                            note=title, url=test_url, media_type=media_type
                        )

        yield event
def scrape_agenda(self, chamber, url):
    """Scrape a single Rhode Island agenda page and yield it as an Event.

    :param chamber: chamber the agenda belongs to (not used in this body)
    :param url: agenda page URL
    """
    page = self.lxmlize(url)
    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if date_time == []:
        return

    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf["DATE:"]
    time = metainf["TIME:"]
    where = metainf["PLACE:"]

    # check for duration in time
    if " - " in time:
        start, end = time.split(" - ")
        am_pm_srch = re.search("(?i)(am|pm)", end)
        if am_pm_srch:
            time = " ".join([start, am_pm_srch.group().upper()])
        else:
            time = start

    fmts = ["%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M"]

    event_desc = "Meeting Notice"
    if "Rise" in time:
        # "Rise of the ..." notices have no concrete time; keep date only
        datetime = date
        event_desc = "Meeting Notice: Starting at {}".format(time)
    else:
        datetime = "%s %s" % (date, time)
    if "CANCELLED" in datetime.upper() or "CANCELED" in datetime.upper():
        return

    if page.xpath("//span[@id='lblSession']"):
        event_desc = (
            page.xpath("//span[@id='lblSession']")[0].text_content().strip()
        )

    # normalize noisy time markers before trying the strptime formats
    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        datetime = datetime.replace(trans, transtable[trans])

    datetime = datetime.strip()

    for fmt in fmts:
        try:
            datetime = dt.datetime.strptime(datetime, fmt)
            break
        except ValueError:
            continue
    # NOTE(review): if no format matches, `datetime` stays a str and the
    # localize() call below will fail -- confirm the site's date strings
    # always fit one of `fmts`.

    event = Event(
        name=event_desc, start_date=self._tz.localize(datetime), location_name=where
    )
    event.add_document("Agenda", url, media_type="text/html", on_duplicate="ignore")
    event.add_source(url)

    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib["href"]
        event.add_document(
            bill.text_content(),
            bill_ft,
            media_type="application/pdf",
            on_duplicate="ignore",
        )
        root = bill.xpath("../../*")
        root = [x.text_content() for x in root]
        bill_id = "".join(root).replace("\u00a0", "")
        if "SCHEDULED FOR" in bill_id:
            continue
        descr = bill.getparent().getparent().text_content().replace("\u00a0", " ")

        # `replace` is a module-level mapping defined elsewhere in this
        # file -- presumably strips stray markers from bill ids; verify there.
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])

        item = event.add_agenda_item(descr)
        item.add_bill(bill_id)

    # sometimes bill references are just plain links or plain text.
    bill_links = page.xpath('//a[contains(@href,"/BillText/")]/@href')
    linked_bills = set()
    for bill_link in bill_links:
        bill_nums = re.findall(r"\/(\w+\d+)\.pdf", bill_link, flags=re.IGNORECASE)
        for bill_num in bill_nums:
            linked_bills.add(bill_num)

    # sometimes (H 1234) ends up in the title or somewhere else unlinked
    text_bill_nums = re.findall(
        r"\((\w{1,3}\s?\d+)\)", page.text_content(), flags=re.IGNORECASE
    )
    for bill_num in text_bill_nums:
        bill_num = bill_num.replace(" ", "")
        linked_bills.add(bill_num)

    if len(linked_bills) != 0:
        item = event.add_agenda_item("Bills under consideration")
        for bill in linked_bills:
            item.add_bill(bill)

    if page.xpath("//span[@id='lblSession']"):
        committee = page.xpath("//span[@id='lblSession']")[0].text_content()
        event.add_participant(committee, "committee", note="host")

    yield event
def scrape_house_weekly_schedule(self):
    """Scrape the Louisiana House weekly meeting schedule and yield Events."""
    url = "https://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)

    meeting_rows = page.xpath('//table[@id = "table229"]/tr')

    # keep only rows with a committee name, a PDF agenda, and a real meeting
    valid_meetings = [
        row
        for row in meeting_rows
        if row.xpath("./td[1]")[0].text_content().replace("\xa0", "")
        and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
        and "Not Meeting" not in row.xpath("./td[2]")[0].text_content()
    ]

    for meeting in valid_meetings:
        try:
            guid = meeting.xpath(
                "./td/a[descendant::img[contains(@src," '"PDF-AGENDA.png")]]/@href'
            )[0]
            # self.logger.debug(guid)
            self.warning("logger.debug" + guid)
        except IndexError:
            # BUG FIX: xpath(...)[0] raises IndexError on an empty result,
            # not KeyError, so the original except clause could never fire.
            # Sometimes we have a dead link. This is only on dead entries.
            continue

        committee_name = meeting.xpath("./td[1]/text()")[0].strip()
        meeting_string = meeting.xpath("./td[2]")[0].text_content()

        if "@" in meeting_string:
            continue  # Contains no time data.

        date, time, location = (
            [s.strip() for s in meeting_string.split(",") if s] + [None] * 3
        )[:3]

        # check for time in date because of missing comma
        time_srch = re.search(r"\d{2}:\d{2} (AM|PM)", date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, "")

        # self.logger.debug(location)
        self.warning("logger.debug" + location)

        year = datetime.datetime.now().year
        datetime_string = " ".join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string, "%b %d %Y %I:%M %p")
        when = self._tz.localize(when)

        description = "Committee Meeting: {}".format(committee_name)
        # self.logger.debug(description)
        self.warning("logger.debug" + description)

        event = Event(
            name=description,
            # BUG FIX: `when` was already localized above; localizing it a
            # second time raises ValueError on an aware datetime.
            start_date=when,
            location_name=location,
        )
        event.add_source(url)
        event.add_participant(committee_name, type="committee", note="host")
        event.add_document(
            note="Agenda", url=guid, text="agenda", media_type="application/pdf"
        )

        yield event
def scrape_lower(self):
    """Scrape the Virginia House meeting schedule page and yield Events."""
    list_url = (
        "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php"
    )

    page = self.get(list_url).content
    page = lxml.html.fromstring(page)

    page.make_links_absolute(list_url)

    for row in page.xpath("//table[contains(@class, 'CODayTable')]/tbody/tr"):
        # TODO: it would be nice to go back in and update the record to mark it as cancelled,
        # but since there's no ics link it makes the day logic way more complicated
        if row.xpath(".//span[contains(@class, 'COCancelled')]"):
            continue

        # fallback for unlinked events
        source = (
            "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php"
        )

        if row.xpath(".//a[1]/text()"):
            title = row.xpath(".//a[1]/text()")[0].strip()
            source = row.xpath(".//a[1]/@href")[0]
            event_type = "committee-meeting"
        else:
            # skip unlinked misc events
            if row.xpath("td[contains(@class, 'COCommType')]/text()"):
                title = row.xpath("td[contains(@class, 'COCommType')]/text()")[
                    0
                ].strip()
                event_type = "other"
            else:
                continue

        # the date/time and location are encoded in the "Add to Calendar"
        # link's query string (dt and loc parameters)
        date_link = row.xpath(".//a[@title='Add to Calendar']/@href")[0]
        parsed = parse.parse_qs(parse.urlparse(date_link).query)
        date_raw = parsed["dt"][0]
        location = parsed["loc"][0]

        start = dateutil.parser.parse(date_raw, tzinfos=self.tzinfos)

        # If there's a chair in parentheticals, remove them from the title
        # and add as a person instead
        chair_note = re.findall(r"\(.*\)", title)
        chair = None
        for chair_str in chair_note:
            title = title.replace(chair_str, "").strip()
            # drop the outer parens
            chair = chair_str[1:-1]

        event = Event(
            name=title,
            start_date=start,
            location_name=location,
            classification=event_type,
        )
        event.add_source(source)

        if chair is not None:
            event.add_participant(chair, type="person", note="chair")

        if event_type == "committee-meeting":
            event.add_participant(title, type="committee", note="host")

        if row.xpath(".//a[contains(@class,'COAgendaLink')]"):
            agenda_url = row.xpath(".//a[contains(@class,'COAgendaLink')]/@href")[0]
            event.add_document("Agenda", agenda_url, media_type="text/html")
            self.scrape_lower_agenda(event, agenda_url)

        yield event
def scrape_chamber(self, chamber):
    """Scrape Idaho committee agendas for one chamber and yield Events.

    :param chamber: "upper" or "lower"
    """
    if chamber == "upper":
        url = "https://legislature.idaho.gov/sessioninfo/agenda/sagenda/"
    elif chamber == "lower":
        url = "https://legislature.idaho.gov/sessioninfo/agenda/hagenda/"

    page = self.get(url).content
    page = lxml.html.fromstring(page)

    for row in page.xpath('//div[@id="ai1ec-container"]/div'):
        month = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/text()"
        )[0].strip()
        day = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/span/text()"
        )[0].strip()

        time_and_loc = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'abbr')]/h2/text()"
        )
        time = time_and_loc[0].strip()
        loc = time_and_loc[1].strip()

        if "not meet" in time.lower():
            continue

        # many rows carry non-time strings; fall back to a date-only start
        try:
            start = dateutil.parser.parse(f"{month} {day} {time}")
        except dateutil.parser._parser.ParserError:
            start = dateutil.parser.parse(f"{month} {day}")

        start = self._tz.localize(start)

        com = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'day')]/h2/a/text()"
        )[0].strip()

        event = Event(
            name=com,
            start_date=start,
            location_name=loc,
            classification="committee-meeting",
        )
        event.add_participant(com, type="committee", note="host")

        agenda_url = row.xpath('.//a[contains(text(), "Full Agenda")]/@href')[0]
        event.add_document("Agenda", agenda_url, media_type="application/pdf")

        # skip the header row of the agenda table
        agenda_rows = row.xpath(
            './/div[contains(@class,"card")]/div[contains(@id, "Agenda")]/div/table/tbody/tr'
        )[1:]

        for agenda_row in agenda_rows:
            subject = agenda_row.xpath("string(td[1])").strip()
            description = agenda_row.xpath("string(td[2])").strip()
            presenter = agenda_row.xpath("string(td[3])").strip()
            if presenter != "":
                agenda_text = f"{subject} {description} Presenter: {presenter}".strip()
                # BUG FIX: the participant should be the presenter's name;
                # the original added the entire agenda line as a "person".
                event.add_participant(presenter, type="person", note="Presenter")
            else:
                agenda_text = f"{subject} {description}".strip()

            agenda = event.add_agenda_item(agenda_text)

            if agenda_row.xpath('td[1]/a[contains(@href,"/legislation/")]'):
                agenda.add_bill(
                    agenda_row.xpath(
                        'td[1]/a[contains(@href,"/legislation/")]/text()'
                    )[0].strip()
                )

        event.add_source(url)
        yield event
def scrape_meeting_page(self, com_id, chamber, com_name, meeting_date, meeting_time, location):
    """Scrape one Kansas committee documents page for a meeting and yield the Event.

    :param com_id: committee slug, e.g. "ctte_s_jud_1"
    :param chamber: chamber key for self.chamber_names
    :param com_name: committee display name
    :param meeting_date: "YYYY-MM-DD" date used by the date_choice query arg
    :param meeting_time: time string; may not be parseable
    :param location: room/location string
    """
    # http://www.kslegislature.org/li/b2021_22/committees/ctte_s_jud_1/documents/?date_choice=2021-03-19
    meeting_page_url = (
        f"http://www.kslegislature.org/li/{self.slug}/"
        f"committees/{com_id}/documents/?date_choice={meeting_date}"
    )
    page = self.get(meeting_page_url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(meeting_page_url)

    # fall back to a date-only start when the time string won't parse
    try:
        start_date = dateutil.parser.parse(f"{meeting_date} {meeting_time}")
    except dateutil.parser._parser.ParserError:
        start_date = dateutil.parser.parse(meeting_date)

    start_date = self.tz.localize(start_date)

    pretty_chamber = self.chamber_names[chamber].title()

    event = Event(
        start_date=start_date,
        name=f"{pretty_chamber} {com_name}",
        location_name=location,
    )
    event.add_participant(
        f"{pretty_chamber} {com_name}", type="committee", note="host"
    )

    # Agendas & Minutes
    for row in page.xpath(
        "//table[.//h4[contains(text(), 'Agendas')]]/table[contains(@class,'bottom')]/tbody/tr"
    ):
        doc_name = row.xpath("td[1]")[0].text_content()
        doc_url = row.xpath("td[2]/a/@href")[0]
        event.add_document(doc_name, doc_url, media_type="application/pdf")

    # Witness testimony
    for row in page.xpath("//tr[td[ul[@id='testimony-docs']]]"):
        doc_type = row.xpath("td[1]")[0].text_content()
        meta = row.xpath("td[2]/ul[@id='testimony-docs']")[0]

        witness = meta.xpath(
            "li[strong[contains(text(),'Presenter')]]/text()"
        )[0].strip()

        org = ""
        if meta.xpath("li[strong[contains(text(),'Organization')]]/text()"):
            org = meta.xpath(
                "li[strong[contains(text(),'Organization')]]/text()"
            )[0].strip()

        topic = meta.xpath("li[strong[contains(text(),'Topic')]]/text()")[0].strip()

        if org:
            doc_name = f"{doc_type} - {witness} ({org}) - {topic}"
        else:
            doc_name = f"{doc_type} - {witness} - {topic}"

        agenda = event.add_agenda_item(doc_name)

        # link the testimony to its bill when a Measure is listed
        if meta.xpath("li[strong[contains(text(),'Measure')]]/text()"):
            bill_id = meta.xpath(
                "li[strong[contains(text(),'Measure')]]/text()"
            )[0].strip()
            agenda.add_bill(bill_id)

    event.add_source(meeting_page_url)
    yield event