def scrape_meeting_notice(self, chamber, item, url):
    # Since Event Name is not provided for all meetings.
    event_name = str(item['CommitteeName'])
    # e.g. 04/25/2012 03:00:00 PM -- format matches the sample date above
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'
                              .format(item['CommitteeMeetingStatusName']))
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']), id=item['CommitteeId'])

    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}"
                .format(item['CommitteeMeetingId']))
    event.add_source(page_url)

    page_data = self.post(page_url).json()['Data']
    # use a distinct name to avoid shadowing the `item` parameter
    for agenda_item in page_data:
        event.add_agenda_item(description=str(agenda_item['ItemDescription']))
        event.add_person(name=str(agenda_item['PrimarySponsorShortName']),
                         id=str(agenda_item['PrimarySponsorPersonId']),
                         note='Sponsor')

    yield event

def scrape(self):
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for row in table.xpath(".//tr")[1:]:
        tds = row.xpath("./td")
        committee = tds[0].text_content().strip()
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0].replace('.', '').strip()
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text

        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = pytz.utc.localize(when)
        event = Event(name=descr,
                      start_time=when,
                      classification='committee-meeting',
                      description=descr,
                      location_name=where,
                      timezone=tz.zone)

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee:
                # look up the committee by its short code, falling back
                # to the raw name
                committee = self.short_ids.get(committee, {
                    "chamber": "unknown",
                    "name": committee,
                })
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type='text/html')

        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(bill['bill_id'], note=bill['type'])

        yield event

def scrape(self):
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for row in events:
        comit_url = row.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
        if len(comit_url) != 1:
            raise Exception
        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])

        tds = row.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )

        event.add_source(calurl)
        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')

        for entry in what:
            item = event.add_agenda_item(entry)
            if entry.startswith('AB') or entry.startswith('SB'):
                item.add_bill(entry)

        for thing in who:
            event.add_person(thing['name'])

        yield event

def scrape(self, session=None):
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    records = self.access_to_csv("Agendas")

    for record in records:
        if record['Status'] != "Scheduled":
            continue
        description = record['Comments']
        related_bills = []

        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description,
            })

        date_time = "%s %s" % (record['Date'], record['Time'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

        try:
            hr_name = self._committees[record['CommHouse']]
        except KeyError:
            self.warning('unknown committee code %s, skipping',
                         record['CommHouse'])
            # skip records with unrecognized committee codes
            continue

        description = 'Meeting of the {}'.format(hr_name)

        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record['Location'] or 'Statehouse',
        )

        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill['bill_id'])

        event.add_committee(
            hr_name,
            id=record['CommHouse'],
            note='host',
        )
        event.add_source('http://www.njleg.state.nj.us/downloads.asp')

        yield event

def scrape(self, session=None):
    if session is None:
        session = self.latest_session()

    year_slug = self.jurisdiction.get_year_slug(session)

    url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
        year_slug
    )
    json_data = self.get(url).text
    events = json.loads(json_data)["data"]

    for info in events:
        # Determine when the committee meets
        if (
            info["TimeSlot"] == ""
            or info["TimeSlot"] == "1"
            or info["TimeSlot"] == 1
        ):
            start_time = datetime.datetime.strptime(
                info["MeetingDate"], "%A, %B %d, %Y"
            )
            all_day = True
        else:
            try:
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"] + ", " + info["TimeSlot"],
                    "%A, %B %d, %Y, %I:%M %p",
                )
            except ValueError:
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"] + ", " + info["StartTime"],
                    "%A, %B %d, %Y, %I:%M %p",
                )
            all_day = False

        event = Event(
            start_date=self.TIMEZONE.localize(start_time),
            all_day=all_day,
            name="Meeting of the {}".format(info["LongName"]),
            description="committee meeting",
            location_name="{0}, Room {1}".format(
                info["BuildingName"], info["RoomNbr"]
            ),
        )
        event.add_source(url)
        event.add_committee(name=info["LongName"], note="host")

        yield event

def scrape(self, session=None):
    if session is None:
        session = self.latest_session()

    year_slug = session[5:]

    url = 'http://legislature.vermont.gov/committee/loadAllMeetings/{}'.format(
        year_slug)
    json_data = self.get(url).text
    events = json.loads(json_data)['data']

    for info in events:
        # Determine when the committee meets
        if info['TimeSlot'] == '1':
            start_time = datetime.datetime.strptime(info['MeetingDate'],
                                                    '%A, %B %d, %Y')
            all_day = True
        else:
            try:
                start_time = datetime.datetime.strptime(
                    info['MeetingDate'] + ', ' + info['TimeSlot'],
                    '%A, %B %d, %Y, %I:%M %p'
                )
            except ValueError:
                start_time = datetime.datetime.strptime(
                    info['MeetingDate'] + ', ' + info['StartTime'],
                    '%A, %B %d, %Y, %I:%M %p'
                )
            all_day = False

        event = Event(
            start_time=self.TIMEZONE.localize(start_time),
            timezone='America/New_York',
            all_day=all_day,
            name="Meeting of the {}".format(info['LongName']),
            description="committee meeting",
            location_name="{0}, Room {1}".format(info['BuildingName'],
                                                 info['RoomNbr'])
        )
        event.add_source(url)
        event.add_committee(
            name=info['LongName'],
            note='host'
        )

        yield event

def scrape(self):
    url = 'https://lims.minneapolismn.gov/Calendar/GetCalenderList?'
    council_events = cal_list

    for c in council_events:
        mtg_time = datetime.strptime(c['MeetingTime'], CAL_DATE_FORMAT)
        dt = tz.localize(mtg_time)

        e = Event(name=c['CommitteeName'],
                  start_date=dt,
                  location_name=c['Location'])
        e.add_committee(c['CommitteeName'])
        e.add_source(url)

        if c['MarkedAgendaPublished'] == True:
            event_url = "{0}{1}/{2}".format(AGENDA_BASE_URL,
                                            c['Abbreviation'],
                                            c['AgendaId'])
            e.add_media_link(note="Agenda", url=event_url, media_type="link")

        yield e

def scrape(self, session=None):
    if session is None:
        session = self.latest_session()

    year_slug = self.jurisdiction.get_year_slug(session)

    url = 'http://legislature.vermont.gov/committee/loadAllMeetings/{}'.format(
        year_slug)
    json_data = self.get(url).text
    events = json.loads(json_data)['data']

    for info in events:
        # Determine when the committee meets
        if info['TimeSlot'] == '' or info['TimeSlot'] == '1':
            start_time = datetime.datetime.strptime(info['MeetingDate'],
                                                    '%A, %B %d, %Y')
            all_day = True
        else:
            try:
                start_time = datetime.datetime.strptime(
                    info['MeetingDate'] + ', ' + info['TimeSlot'],
                    '%A, %B %d, %Y, %I:%M %p'
                )
            except ValueError:
                start_time = datetime.datetime.strptime(
                    info['MeetingDate'] + ', ' + info['StartTime'],
                    '%A, %B %d, %Y, %I:%M %p'
                )
            all_day = False

        event = Event(
            start_date=self.TIMEZONE.localize(start_time),
            all_day=all_day,
            name="Meeting of the {}".format(info['LongName']),
            description="committee meeting",
            location_name="{0}, Room {1}".format(info['BuildingName'],
                                                 info['RoomNbr'])
        )
        event.add_source(url)
        event.add_committee(
            name=info['LongName'],
            note='host'
        )

        yield event

def scrape(self):
    current_date = datetime.today()
    current_month = current_date.month
    current_year = current_date.year
    date_range = []
    print(current_month)

    # build YYYY-MM stamps for the current month and the three that follow
    for x in range(0, 4):
        if not current_month == 12:
            cm = current_month
            if len(str(cm)) < 2:
                cm = '0{0}'.format(cm)
            timestamp = "{0}-{1}".format(current_year, cm)
            date_range.append(timestamp)
            current_month += 1
        elif current_month == 12:
            cm = '12'
            timestamp = "{0}-{1}".format(current_year, cm)
            date_range.append(timestamp)
            current_month = 1
            current_year += 1

    format1 = "%A %B %d, %Y - %I:%M %p"
    format2 = "%A %B %d, %Y - "
    format3 = "%m/%d/%y"

    for date in date_range:
        root = requests.get("https://www.stpaul.gov/calendar/" + date)
        base = html.fromstring(root.text)
        items = base.xpath('.//*/div[@class="view-content"]/div')
        meetings = []

        for i in items:
            if len(i.xpath(
                    './/*/span[@class="date-display-single"]/text()')) > 0:
                d = {}
                d['date'] = i.xpath(
                    './/*/span[@class="date-display-single"]/text()')[0]
                d['info'] = i.xpath(
                    './/*/span[@class="field-content"]/a/text()')[0]
                d['link'] = i.xpath(
                    './/*/span[@class="field-content"]/a/@href')[0]
                meetings.append(d)

        for m in meetings:
            m['link'] = "https://www.stpaul.gov" + m['link']

        for m in meetings:
            ppr(m['info'])
            r = requests.get(m['link'])
            b = html.fromstring(r.text)
            exists = b.xpath('.//div[@class="node-content clearfix"]')
            if len(exists) > 0:
                date = exists[0].xpath(
                    './/*/span[@class="date-display-single"]/text()')
                loc1 = exists[0].xpath(
                    './/*/div[@class="thoroughfare"]/text()')
                loc2 = exists[0].xpath('.//*/div[@class="premise"]/text()')

                if len(loc1) > 0:
                    m['location'] = loc1[0]
                    if len(loc2) > 0:
                        m['location'] = m['location'] + " " + loc2[0]
                else:
                    m['location'] = 'N/A'

                if ":" in date[0]:
                    m['date'] = datetime.strptime(date[0], format1)
                elif "/" in date[0]:
                    # zero-pad single-digit month/day components
                    new_date = date[0].split('/')
                    new_date = ['0' + n if len(n) == 1 else n
                                for n in new_date]
                    new_date = '/'.join(new_date)
                    m['date'] = datetime.strptime(new_date, format3)
                else:
                    date = datetime.strptime(date[0], format2)
                    m['date'] = date

                m['date'] = tz.localize(m['date'])

                if ('City Council' not in m['info']
                        and 'Legislative' not in m['info']
                        and 'Holiday' not in m['info']):
                    event = Event(name=m['info'].strip(),
                                  start_date=m['date'],
                                  location_name=m['location'])
                    m['name'] = m['info'].replace('Meeting', '').replace(
                        ' - Cancelled', '').replace('Events', '').strip()
                    event.add_committee(m['name'])
                elif 'Holiday' in m['info']:
                    event = Event(name=m['info'].strip(),
                                  start_date=m['date'],
                                  location_name=m['location'])
                else:
                    event = Event(name=m['info'].strip(),
                                  start_date=m['date'],
                                  location_name=m['location'])
                    event.add_committee('Saint Paul City Council')

                event.add_source(m['link'])
                yield event

def scrape(self):
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for row in table.xpath(".//tr")[1:]:
        tds = row.xpath("./td")
        committee = tds[0].text_content().strip()

        if self.short_ids.get(committee):
            descr = "{} {}".format(
                self.chambers[self.short_ids[committee]["chamber"]],
                self.short_ids[committee]["name"],
            )
        else:
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0].replace(".", "").strip()

        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib["href"]
        notice_name = notice.text

        # the listing page shows the same hearing in multiple rows.
        # combine these -- get_related_bills() will take care of adding
        # the bills and descriptions
        if notice_href in self.seen_hearings:
            continue
        else:
            self.seen_hearings.append(notice_href)

        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = TIMEZONE.localize(when)
        event = Event(
            name=descr,
            start_date=when,
            classification="committee-meeting",
            description=descr,
            location_name=where,
        )

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee and committee in self.short_ids:
                committee = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            event.add_committee(committee, note="host")

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type="text/html")

        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill["descr"].strip())
            a.add_bill(bill["bill_id"], note=bill["type"])

        yield event

def scrape_chamber(self, chamber=None):
    # If chamber is None, don't exclude any events from the results
    # based on chamber
    chmbr = cal_chamber_text.get(chamber)
    tables = url_xpath(cal_weekly_events, "//table[@class='date-table']")

    for table in tables:
        date = table.xpath("../.")[0].getprevious().text_content()
        trs = table.xpath("./tr")

        for tr in trs:
            order = ["time", "chamber", "type", "agenda", "location", "video"]
            tds = tr.xpath("./td")
            metainf = {}

            if not tds:
                continue

            for el in range(0, len(order)):
                metainf[order[el]] = tds[el]

            if chmbr and metainf['chamber'].text_content() != chmbr:
                self.info("Skipping event based on chamber.")
                continue

            time = metainf['time'].text_content()
            datetime_string = "%s %s" % \
                (date.strip(' \r\n'), time.strip(' \r\n'))
            location = metainf['location'].text_content()
            description = metainf['type'].text_content()

            dtfmt = "%A, %B %d, %Y %I:%M %p"
            dtfmt_no_time = "%A, %B %d, %Y"

            if time == 'Cancelled':
                self.log("Skipping cancelled event.")
                continue
            else:
                if "Immediately follows H-FLOOR" in datetime_string:
                    continue
                if ' Immediately follows' in datetime_string:
                    datetime_string, _ = datetime_string.split(
                        'Immediately follows')
                if "canceled" in datetime_string.lower():
                    continue
                if "TBA" in datetime_string:
                    continue

                datetime_string = datetime_string.strip()

                try:
                    when = dt.datetime.strptime(datetime_string, dtfmt)
                except ValueError:
                    when = dt.datetime.strptime(datetime_string,
                                                dtfmt_no_time)
                when = self._utc.localize(when)

            event = Event(
                name=description,
                start_date=when,
                location_name=location,
                description=description,
            )
            # The description is a committee name
            event.add_committee(name=description)
            event.add_source(cal_weekly_events)

            agenda = metainf['agenda'].xpath(".//a")
            if len(agenda) > 0:
                for doc in agenda:
                    if not doc.text_content():
                        continue
                    agenda_url = doc.attrib['href']
                    self.add_agenda(agenda_url, doc.text_content(), event)

            yield event

def scrape(self, start_time=None):
    if start_time is None:
        start_time = datetime.datetime(2017, 1, 1, 0, 0, tzinfo=pytz.utc)

    dupes = {}
    uniq = {}
    bad_ids = []

    for i, hearing in enumerate(self.congressional_hearings(start_time)):
        package_id = hearing['packageId']
        try:
            package_num, = re.findall(r'\d+$', package_id)
        except ValueError:
            bad_ids.append(package_id)
            continue

        # For appropriations hearings, the committees tend to
        # publish portions of the hearings as they are completed,
        # and then the final hearing is usually compiled,
        # printed, and added to the repository at the request of
        # the Committee.
        #
        # packages with 8 digits after hrg are the in-process
        # version
        #
        # There could be some time between the in-process and
        # final packages. Publication of hearings is the purview
        # of the committee.
        #
        # https://github.com/usgpo/api/issues/21#issuecomment-435926223
        if len(package_num) == 8:
            continue

        mods_link = hearing['download']['modsLink']
        response = self.get(mods_link)
        mods = xmltodict.parse(response.content)
        extension = collections.ChainMap(*mods['mods']['extension'])

        granule_class = extension.get('granuleClass', 'boo')
        if granule_class == 'ERRATA':
            continue

        meeting_type = self._meeting_type(extension)
        if meeting_type is None:
            continue

        held_date = extension['heldDate']
        if type(held_date) is list:
            start_date = min(held_date)
        else:
            start_date = held_date

        event = Event(name=self._title(mods),
                      start_date=start_date,
                      classification=meeting_type,
                      location_name='unknown')
        if not event.name:
            continue

        if 'number' in extension:
            hearing_number = '{docClass} {congress}-{number}'.format(
                **extension)
            print(hearing_number)
            event.extras['hearing_number'] = hearing_number

        for committee_d in self._unique(extension.get('congCommittee', [])):
            names = committee_d['name']
            committee_name = self._name_type(names, 'authority-standard')
            if committee_name is None:
                committee_name = self._name_type(names, 'authority-short')

            if committee_d['@chamber'] == 'H':
                committee_name = 'House ' + committee_name
            elif committee_d['@chamber'] == 'S':
                committee_name = 'Senate ' + committee_name

            try:
                thomas_id = committee_d['@authorityId'].upper()
            except KeyError:
                thomas_id = None

            sub_committees = self._subcommittees(committee_d)
            if sub_committees:
                for sub_committee_d in sub_committees:
                    sub_committee_name = sub_committee_d['name']['#text']
                    sub_committee_name = sub_committee_name.strip(
                        string.punctuation)
                    sub_committee_id = _make_pseudo_id(
                        name=sub_committee_name,
                        parent__identifiers__identifier=thomas_id)
                    ret = {
                        "name": sub_committee_name,
                        "entity_type": 'organization',
                        "note": 'host',
                        "organization_id": sub_committee_id,
                    }
                    event.participants.append(ret)
            else:
                if thomas_id:
                    ret = {
                        "name": committee_name,
                        "entity_type": 'organization',
                        "note": 'host',
                        "organization_id": _make_pseudo_id(
                            identifiers__identifier=thomas_id),
                    }
                    event.participants.append(ret)
                else:
                    event.add_committee(committee_name, note='host')

        links = mods['mods']['location']['url']
        for link in self._unique(links):
            if link['@displayLabel'] == 'Content Detail':
                event.add_source(link['#text'], note='web')
            elif link['@displayLabel'] == 'HTML rendition':
                event.add_document('transcript', link['#text'],
                                   media_type='text/html')
            elif link['@displayLabel'] == 'PDF rendition':
                event.add_document('transcript', link['#text'],
                                   media_type='application/pdf')

        event.add_source(mods_link, note='API')

        self._unique_event(uniq, event, dupes)

    self._house_docs(uniq)

    for event in uniq.values():
        yield event

    with open('bad_ids.txt', 'w') as f:
        for id in bad_ids:
            f.write(id + '\n')

def _parse_house_floor_xml_legislative_activity(self, xml):
    """
    Parses XML string of House floor updates and yields them in loop.

    @param xml: XML of floor update
    @type xml: string

    @return: complete Event object
    @rtype: Event
    """
    tree = self._xml_parser(xml)

    congress = tree.xpath('.//legislative_congress')[0].get('congress')

    house_committees = self._get_current_house_committee_names()
    for fa in tree.xpath('.//floor_action'):
        fa_text = fa.xpath('.//action_description')[0].xpath('string()')

        eastern = pytz.timezone('US/Eastern')
        dt = datetime.datetime.strptime(
            fa.xpath('action_time')[0].get('for-search'), '%Y%m%dT%H:%M:%S')
        event = Event('House Floor Update on {0} at {1}.'.format(
                          dt.strftime('%Y-%m-%d'), dt.strftime('%H:%M:%S')),
                      eastern.localize(dt).astimezone(pytz.utc),
                      'US/Eastern',
                      '',
                      description=fa_text,
                      classification='floor_update')

        event.set_location(
            "East Capitol Street Northeast & First St SE, "
            "Washington, DC 20004",
            note='House Floor',
            url='http://www.house.gov',
            coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

        event.add_source(
            self._house_floor_src_url(
                date_str=tree.xpath('.//legislative_day')[0].get('date')),
            note="Scraped from the Office of the Clerk, "
                 "U.S. House of Representatives website.")

        event.extras['act-id'] = fa.get('act-id')
        event.extras['unique-id'] = fa.get('unique-id')

        # bills
        ai_b = event.add_agenda_item(
            description='Bills referenced by this update.')
        for bill in fa.xpath(".//a[@rel='bill']"):
            bill_name = bill.xpath('string()')
            ai_b.add_bill(
                bill_name,
                id=make_pseudo_id(identifier=bill_code_to_id(bill_name),
                                  congress=congress),
                note="Bill was referenced on the House floor.")

        # publaws
        ai_p = event.add_agenda_item(
            description='Public laws referenced by this update.')
        for law in fa.xpath(".//a[@rel='publaw']"):
            detail_url = '/'.join(
                law.get('href').split('/')[0:-2]) + '/content-detail.html'
            ai_p.add_bill(
                law.xpath('string()'),
                id=make_pseudo_id(
                    **self._public_law_detail_scraper(url=detail_url)),
                note='Law was referenced on the House floor.')

        # votes
        ai_v = event.add_agenda_item(
            description='Votes referenced by this update.')
        for vote in fa.xpath(".//a[@rel='vote']"):
            vote_name = vote.xpath('string()')
            ai_v.add_vote(
                vote_name,
                id=make_pseudo_id(identifier=vote_code_to_id(vote_name),
                                  congress=congress),
                note='Vote was referenced on the House floor.')

        # reports
        for report in fa.xpath(".//a[@rel='report']"):
            event.add_document('Document referenced by this update.',
                               report.get('href'), media_type='text/html')

        for name in house_committees:
            if name.replace('House ', '') in fa_text:
                event.add_committee(name, id=make_pseudo_id(name=name))

        # TODO identify legislators and add them as participants?

        yield event

def scrape(self):
    last_events = deque(maxlen=10)

    for event, agenda in self.events(since=2017):
        other_orgs = ''
        extras = []

        if '--em--' in event[u'Meeting Location']:
            location_string, note = event[u'Meeting Location'].split(
                '--em--')[:2]
            for each in note.split(' - '):
                if each.startswith('Join'):
                    other_orgs = each
                else:
                    extras.append(each)
        else:
            location_string = event[u'Meeting Location']

        location_list = location_string.split('-', 2)
        location = ', '.join([each.strip() for each in location_list[0:2]])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])

        response = self.get(event['iCalendar']['url'], verify=False)
        event_time = self.ical(response.text).subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour, minute=event_time.minute)

        time_string = event['Meeting Time']
        if time_string in ('Deferred',):
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'

        description = event['Meeting\xa0Topic']
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = ''

        event_name = event['Name']

        event_id = (event_name, when)
        if event_id in last_events:
            continue
        else:
            last_events.append(event_id)

        e = Event(name=event_name,
                  start_date=when,
                  description=description,
                  location_name=location,
                  status=status)

        if extras:
            e.extras = {'location note': ' '.join(extras)}

        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')

        if event['Name'] == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event['Name'].lower():
            participating_orgs = [event["Name"]]
        else:
            participating_orgs = []

        if other_orgs:
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        if agenda:
            e.add_source(event["Meeting Details"]['url'], note='web')

            for item, _, _ in agenda:
                if item["Name"]:
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        if item['Action']:
                            note = item['Action']
                        else:
                            note = 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e

def scrape(self):
    last_events = deque(maxlen=10)

    for event, agenda in self.events(since=2011):
        other_orgs = ''
        extras = []

        if '--em--' in event[u'Meeting Location']:
            location_string, note = event[u'Meeting Location'].split(
                '--em--')[:2]
            for each in note.split(' - '):
                if each.startswith('Join'):
                    other_orgs = each
                else:
                    extras.append(each)
        else:
            location_string = event[u'Meeting Location']

        location_list = location_string.split('-', 2)
        location = ', '.join([each.strip() for each in location_list[0:2]])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])

        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour, minute=event_time.minute)

        time_string = event['Meeting Time']
        if time_string in ('Deferred',):
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'

        description = event['Meeting\xa0Topic']
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = ''

        event_name = event['Name']

        event_id = (event_name, when)
        if event_id in last_events:
            continue
        else:
            last_events.append(event_id)

        e = Event(name=event_name,
                  start_time=when,
                  timezone=self.TIMEZONE,
                  description=description,
                  location_name=location,
                  status=status)

        if extras:
            e.extras = {'location note': ' '.join(extras)}

        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')

        if event['Name'] == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event['Name'].lower():
            participating_orgs = [event["Name"]]
        else:
            participating_orgs = []

        if other_orgs:
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        if agenda:
            e.add_source(event["Meeting Details"]['url'])

            for item, _, _ in agenda:
                if item["Name"]:
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        if item['Action']:
                            note = item['Action']
                        else:
                            note = 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)
        else:
            e.add_source(self.EVENTSPAGE)

        yield e

def scrape(self, window=3):
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))

    for api_event, event in self.events(n_days_ago):
        when = api_event['start']
        location = api_event['EventLocation']

        description = event['Meeting\xa0Topic']
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = None

        if description:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=api_event['status'])
        else:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      location_name=location,
                      status=api_event['status'])

        e.pupa_id = str(api_event['EventId'])

        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')

        location_string = event[u'Meeting Location']
        location_notes, other_orgs = self._parse_location(location_string)

        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}

        if e.name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in e.name.lower():
            participating_orgs = [e.name]
        else:
            participating_orgs = []

        if other_orgs:
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'].strip())
        for person in participants:
            e.add_participant(name=person, type="person")

        e.add_source(self.BASE_URL +
                     '/events/{EventId}'.format(**api_event),
                     note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e

def scrape(self):
    for c in comm_base:
        print(c.xpath('.//h3/a/text()'))

    for c in comm_base:
        m = {}
        m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
        print(c.xpath('.//h3/*'))
        title = c.xpath('.//h3/a/text()')
        if len(title) == 0:
            continue
        else:
            m['title'] = title[0]

        m['link'] = c.xpath('.//h3/a/@href')[0]
        info_div = c.xpath('.//div[@class="calendar_p_indent"]')[0]
        print('one info div')
        if info_div is not None:
            info_list = info_div.xpath('.//text()')
            if info_list[0] == 'Room: ':
                m['room'] = info_list[1]
            if info_list[1] == 'Chair: ':
                chair = info_list[2]
                if ',' in chair:
                    chairs = chair.replace('\xa0', '').split(',')
                    nchairs = []
                    for chair in chairs:
                        if chair.startswith('Rep.') or chair.startswith('Sen.'):
                            cname = pull_middle_name(chair[4:])
                            nchairs.append(cname.strip())
                    m['chair'] = nchairs
                elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                    cname = pull_middle_name(chair[4:].strip())
                    m['chair'] = [cname.strip()]
            if info_list[2] == 'Chair: ':
                chair = info_list[3]
                if ',' in chair:
                    chairs = chair.replace('\xa0', '').split(',')
                    nchairs = []
                    for chair in chairs:
                        if chair.startswith('Rep.') or chair.startswith('Sen.'):
                            cname = pull_middle_name(chair[4:])
                            nchairs.append(cname.strip())
                    m['chair'] = nchairs
                elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                    cname = pull_middle_name(chair[4:].strip())
                    m['chair'] = [cname.strip()]
            if info_list[4] == 'Agenda: ':
                m['agenda'] = info_list[5]

        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'
        ppr(m)

        date = c.xpath('.//p/b/text()')
        if len(date) < 1:
            print('\n\n\n\n NO DATE')
            ppr(m)
            continue
        m['date'] = datetime.datetime.strptime(date[0], format1)

        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])
        if len(m['notice']) > 0:
            pass
        event.add_committee(m['title'])
        event.add_source(m['link'])
        for chair in m['chair']:
            event.add_person(name=chair, note="Chair")
        yield event

def scrape(self):
    for c in senate_base:
        m = {}
        m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
        link = c.xpath('.//h3/a/@href')
        print('top link: ', c.xpath('.//h3/*'))
        if len(link) > 0:
            m['link'] = c.xpath('.//h3/a/@href')[0]
            m['title'] = c.xpath('.//h3/a/text()')[0]
        else:
            m['link'] = 'https://www.leg.state.mn.us/cal?type=all'
            m['title'] = c.xpath('.//h3/text()')[0]
            print('top link 2: ', c.xpath('.//h3/text()'))

        info_div = c.xpath('.//div[@class="calendar_p_indent"]')
        if len(info_div) > 0:
            info_div = info_div[0]
            info_list = info_div.xpath('.//text()')
            nchairs = []
            agenda = False
            for il in info_list:
                il = il.replace('\xa0', '')
                if il.startswith(' and '):
                    il = il.replace(' and ', '')
                if il.startswith('Room'):
                    m['room'] = il
                if il.startswith('Rep.') or il.startswith('Sen.'):
                    cname = pull_middle_name(il[4:])
                    nchairs.append(cname.strip())
                if agenda == True:
                    m['agenda'] = il
                if il == 'Agenda: ':
                    agenda = True
            m['chair'] = nchairs

        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'
        ppr(m)

        date = c.xpath('.//p/span/text()')
        if len(date) < 1:
            print('\n\n\n\n NO DATE')
            ppr(m)
            continue
        if 'or' in date[0]:
            date[0] = date[0].split('or')[0]
        m['date'] = datetime.datetime.strptime(date[0].replace('\xa0', ''),
                                               format1)
        ppr(m)

        if 'room' not in m.keys():
            print('oops')
            m['room'] = 'Senate in session'

        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])
        if len(m['notice']) > 0:
            pass
        event.add_committee(m['title'])
        event.add_source(m['link'])
        for chair in m['chair']:
            event.add_person(name=chair, note="Chair")
        yield event

def scrape(self):
    for c in house_base:
        m = {}
        m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
        links = c.xpath('.//h3/a/@href')
        if len(links) > 0:
            m['cmt'] = c.xpath('.//h3/a/text()')[0]
            m['link'] = c.xpath('.//h3/a/@href')[0]
            title = c.xpath('.//h3/text()')[0]
            if title == 'Agenda:':
                m['title'] = c.xpath('.//h3/a/text()')[0]
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
        else:
            m['title'] = c.xpath('.//h3/text()')[0]
            m['link'] = None

        info_div = c.xpath('.//*[@class="calendar_p_indent"]')
        if len(info_div) == 0:
            pass
        else:
            info_div = info_div[0]
            print('Info Div: ', info_div)
            if len(info_div) > 0:
                info_list = info_div.xpath('.//text()')
                info_links = info_div.xpath('.//*/@href')
                print("info links: ", info_links)
                info_list = [x.replace('\n', '').strip() for x in info_list]
                info_list = [x for x in info_list if len(x) > 0]
                print('Info list: ', info_list)
                if info_list[0].startswith('Room:'):
                    m['room'] = info_list[1]
                else:
                    m['room'] = 'n/a'
                if len(info_list) > 2:
                    if info_list[2].startswith('Chair:'):
                        chair = info_list[3]
                        if ',' in chair:
                            chairs = chair.replace('\xa0', '').split(',')
                            nchairs = []
                            for chair in chairs:
                                if chair.startswith('Rep.') or chair.startswith('Sen.'):
                                    cname = pull_middle_name(chair[4:])
                                    nchairs.append(cname.strip())
                            m['chair'] = nchairs
                        elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                            cname = pull_middle_name(chair[4:].strip())
                            m['chair'] = [cname.strip()]
                    else:
                        m['chair'] = None

        bill_rows = c.xpath('.//*/table[@class="cal_bills"]/tbody/tr')
        print('Bills: ', bill_rows)
        bills = []
        for brs in bill_rows:
            cells = brs.xpath('.//td')
            if len(cells) == 3:
                b = {}
                b['bill'] = cells[0].xpath('.//text()')[0]
                b['author'] = cells[1].xpath('./text()')[0]
                b['summary'] = cells[2].xpath('./text()')[0]
                bills.append(b)

        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'

        date = c.xpath('.//p/b/text()')
        if len(date) < 1:
            print('\n\n\n\n NO DATE')
            continue
        m['date'] = datetime.datetime.strptime(date[0], format1)

        if 'House Meets in Session' in m['title']:
            m['room'] = 'State leg'
            m['cmt'] = 'Minnesota House of Representatives'
            m['chair'] = None
            m['link'] = 'https://www.leg.state.mn.us/cal?type=all'

        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])

        if len(bills) > 0:
            for bill in bills:
                nbill = event.add_agenda_item(description=bill['summary'])
                nbill.add_bill(bill['bill'].replace('HF', 'HF '))
        if len(m['notice']) > 0:
            pass
        event.add_committee(m['cmt'])
        if m['link'] is not None:
            event.add_source(m['link'])
        if m['chair'] is not None:
            for chair in m['chair']:
                event.add_person(name=chair, note="Chair")
        yield event