def scrape(self):
    """Yield one ``Event`` per (API event, web event) pair from ``self.events()``.

    Each event is given its agenda items, the meeting body as an
    organization participant, the API URL as a source, the agenda and
    minutes PDFs when present, and the audio, recap/minutes and
    meeting-detail links found on the Legistar calendar page.

    Yields:
        Event: the populated event.
    """
    for event, web_event in self.events():
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only: a meeting name that itself
            # contains a hyphen must not break the two-way unpacking.
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-', 1)]
        else:
            event_name = body_name

        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            # Unknown agenda status: report the event as tentative rather
            # than handing Event an empty status string.
            status = 'tentative'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        e.pupa_id = str(event['EventId'])

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in
                # the agenda minutes.
                note = "Agenda number, {}".format(
                    item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx,
        # if that data exists.
        if web_event['Audio'] != 'Not\xa0available':
            # The audio URL redirects; store the redirect target.
            redirect_url = self.head(
                web_event['Audio']['url']).headers['Location']

            e.add_media_link(note=web_event['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if web_event['Meeting Details'] != 'Meeting\xa0details':
            # NOTE(review): nesting reconstructed from collapsed source --
            # confirm the fallback pairs with the HEAD check.
            if requests.head(
                    web_event['Meeting Details']['url']).status_code == 200:
                e.add_source(web_event['Meeting Details']['url'], note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

        yield e
def scrape(self, window=None):
    """Yield one ``Event`` per merged (API event, web event) pair.

    Args:
        window: optional number of days to look back. When truthy,
            ``self.events`` is called with ``utcnow() - window`` days;
            otherwise it is called with ``None``.

    Yields:
        Event: the populated event, including English/Spanish GUIDs in
        ``extras``, agenda items, documents, audio links and sources.
    """
    if window:
        n_days_ago = (datetime.datetime.utcnow()
                      - datetime.timedelta(float(window)))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only: a meeting name that itself
            # contains a hyphen must not break the two-way unpacking.
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-', 1)]
        else:
            event_name = body_name

        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]

        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in
                # the agenda minutes.
                note = "Agenda number, {}".format(
                    item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                     note='api')

        if event.get('SAPEventId'):
            e.add_source(
                self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain
                # the location of the audio file. Skip this audio link;
                # a later scrape can pick it up.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            # Each entry is passed to add_source as keyword arguments.
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=3):
    """Yield an ``Event`` for each (API event, web event) pair.

    Args:
        window: number of days to look back; ``self.events`` is called
            with ``utcnow() - window`` days.

    Yields:
        Event: the event with its recording link, agenda/minutes
        documents, participating committees, agenda items, attendees
        recorded as 'Present' in the roll calls, and sources.
    """
    n_days_ago = (datetime.datetime.utcnow()
                  - datetime.timedelta(float(window)))

    # Meeting topics that carry no real description.
    placeholder_topics = ('Multiple meeting items', 'AGENDA TO BE ANNOUNCED')

    for api_event, event in self.events(n_days_ago):
        start = api_event['start']
        place = api_event['EventLocation']
        topic = event['Meeting\xa0Topic']

        is_placeholder = any(marker in topic
                             for marker in placeholder_topics)

        event_kwargs = dict(name=api_event["EventBodyName"],
                            start_date=start,
                            location_name=place,
                            status=api_event['status'])
        # Only pass a description when there is a meaningful one.
        if topic and not is_placeholder:
            event_kwargs['description'] = topic

        e = Event(**event_kwargs)
        e.pupa_id = str(api_event['EventId'])

        multimedia = event['Multimedia']
        if multimedia != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=multimedia['url'],
                             type="recording",
                             media_type='text/html')

        for doc_label in ('Agenda', 'Minutes'):
            self.addDocs(e, event, doc_label)

        location_notes, other_orgs = self._parse_location(
            event[u'Meeting Location'])

        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}

        # Work out which bodies take part in the meeting.
        if e.name == 'City Council Stated Meeting':
            committees = ['New York City Council']
        elif 'committee' in e.name.lower():
            committees = [e.name]
        else:
            committees = []

        if other_orgs:
            joint_bodies = re.sub('Jointl*y with the ', '', other_orgs)
            committees.extend(re.split(' and the |, the ', joint_bodies))

        for committee in committees:
            e.add_committee(name=committee)

        for entry in self.agenda(api_event):
            agenda_item = e.add_agenda_item(entry["EventItemTitle"])
            matter_file = entry["EventItemMatterFile"]
            if matter_file:
                agenda_item.add_bill(matter_file)

        # De-duplicate attendees across roll calls.
        attendees = {call['RollCallPersonName'].strip()
                     for call in self.rollcalls(api_event)
                     if call['RollCallValueName'] == 'Present'}

        for attendee in attendees:
            e.add_participant(name=attendee, type="person")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' is not subscriptable by key here; fall
            # back to the general events page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e
def scrape(self, window=None):
    """Yield one ``Event`` per (API event, web event) pair.

    Args:
        window: optional number of days to look back. When truthy,
            ``self.events`` is called with ``utcnow() - window`` days;
            otherwise it is called with ``None``.

    Yields:
        Event: the populated event. An event whose audio redirect has
        no ``Location`` header yet is skipped entirely.
    """
    if window:
        n_days_ago = (datetime.datetime.utcnow()
                      - datetime.timedelta(float(window)))
    else:
        n_days_ago = None

    for event, web_event in self.events(n_days_ago):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only: a meeting name that itself
            # contains a hyphen must not break the two-way unpacking.
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-', 1)]
        else:
            event_name = body_name

        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links
        e.extras = {'guid': event['EventGuid']}

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in
                # the agenda minutes.
                note = "Agenda number, {}".format(
                    item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx,
        # if that data exists.
        if web_event['Audio'] != 'Not\xa0available':
            try:
                redirect_url = self.head(
                    web_event['Audio']['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain the
                # location of the audio file. Skip these events, and retry
                # on next scrape. (This `continue` abandons the whole
                # event, not just the audio link.)
                continue

            e.add_media_link(note=web_event['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if web_event['Meeting Details'] != 'Meeting\xa0details':
            # NOTE(review): nesting reconstructed from collapsed source --
            # confirm the fallback pairs with the HEAD check.
            if requests.head(
                    web_event['Meeting Details']['url']).status_code == 200:
                e.add_source(web_event['Meeting Details']['url'], note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

        yield e
def scrape(self, window=3):
    """Yield an ``Event`` for each (API event, web event) pair.

    The web event's 'Meeting Location' text is split on '--' to obtain
    the location, and whatever follows 'Chicago, Illinois' in the last
    piece is read as a free-text notice that may cancel the meeting,
    change the room, mark the entry as invalid, or become the event's
    description.

    Args:
        window: number of days to look back; ``self.events`` is called
            with ``utcnow() - window`` days.

    Yields:
        Event: the populated event. Entries with an empty location or a
        'wrong meeting date' notice are skipped.
    """
    n_days_ago = (datetime.datetime.utcnow()
                  - datetime.timedelta(float(window)))

    for api_event, event in self.events(n_days_ago):
        description = None

        when = api_event['start']
        location_string = event[u'Meeting Location']

        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        # Start from the API's status so that every branch below leaves
        # `status` defined (the room-change branch only edits location).
        status = api_event['status']

        status_string = location_list[-1].split('Chicago, Illinois')
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if any(phrase in status_text
                   for phrase in ('rescheduled to',
                                  'postponed to',
                                  'reconvened to',
                                  'meeting recessed',
                                  'recessed meeting',
                                  'recessed until',
                                  'deferred',
                                  'time change',
                                  'date change',
                                  'recessed meeting - reconvene',
                                  'cancelled',
                                  'new date and time',
                                  'rescheduled indefinitely',
                                  'rescheduled for',)):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in ('meeting reconvened',
                                 'reconvened meeting',
                                 'recessed meeting',
                                 'reconvene meeting',
                                 'rescheduled hearing',
                                 'rescheduled meeting',
                                 'amended notice of meeting',
                                 'room change',
                                 'amended notice',
                                 'change of location',
                                 'revised - meeting date and time'):
                # Informational notice only: keep the API's status.
                pass
            elif 'room' in status_text:
                # Room information: prepend it to the location.
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # Unrecognised notice: surface it and keep it as the
                # event description.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone=self.TIMEZONE,
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone=self.TIMEZONE,
                      location_name=location,
                      status=status)

        e.pupa_id = str(api_event['EventId'])

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')

        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'

        e.add_participant(name=participant, type="organization")

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        # De-duplicate attendees across roll calls.
        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'])

        for person in participants:
            e.add_participant(name=person, type="person")

        # The EventId comes from the API record (as for pupa_id above),
        # not from the scraped web row.
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' is not subscriptable by key here; fall
            # back to the general events page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e
def scrape(self, window=None):
    """Yield one ``Event`` per merged (API event, web event) pair.

    Events whose body is in the set returned by the ``/bodies/`` search
    for body types 70 and 75 (service councils, per the inline comment)
    are skipped, unless the body name contains 'Board of Directors -'.

    Args:
        window: optional number of days to look back. When truthy,
            ``self.events`` is called with ``utcnow() - window`` days;
            otherwise it is called with ``None``.

    Yields:
        Event: the populated event.

    Raises:
        ValueError: if two agenda items of one event share the same
            ``EventItemAgendaSequence``.
    """
    if window:
        n_days_ago = (datetime.datetime.utcnow()
                      - datetime.timedelta(float(window)))
    else:
        n_days_ago = None

    events = self.events(since_datetime=n_days_ago)

    service_councils = set(
        sc['BodyId']
        for sc in self.search('/bodies/',
                              'BodyId',
                              'BodyTypeId eq 70 or BodyTypeId eq 75'))

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only: a meeting name that itself
            # contains a hyphen must not break the two-way unpacking.
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-', 1)]
        elif event['EventBodyId'] in service_councils:
            # Don't scrape service council or service council public
            # hearing events.
            self.info('Skipping event {0} for {1}'.format(
                event['EventId'], event['EventBodyName']))
            continue
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final",
        # "Final Revised", and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]

        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if web_event.has_ecomment:
            self.info('Adding eComment link {0} from {1}'.format(
                web_event['eComment'],
                web_event['Meeting Details']['url']))
            e.extras['ecomment'] = web_event['eComment']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given
                    # in the agenda minutes.
                    agenda_number = item["EventItemAgendaNumber"]
                    note = "Agenda number, {}".format(agenda_number)
                    agenda_item['notes'].append(note)
                    agenda_item['extras']['agenda_number'] = agenda_number

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the
            # EventItemAgendaSequence, resulting in data inaccuracies.
            # The scrape should fail in such cases, until Metro cleans
            # the data.
            item_agenda_sequences = [
                entry['extras']['item_agenda_sequence']
                for entry in e.agenda
            ]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                # Adjacent literals instead of backslash continuation, so
                # the source indentation does not leak into the message.
                error_msg = (
                    'An agenda has duplicate agenda items on the Legistar '
                    'grid: {event_name} on {event_date} '
                    '({legistar_api_url}). '
                    'Contact Metro, and ask them to remove the duplicate '
                    'EventItemAgendaSequence.')

                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(
                self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(
                note='Agenda',
                url=event['EventAgendaFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventAgendaLastPublishedUTC']).date())

        # Minutes: prefer the API file, then the web calendar's published
        # minutes, then whatever find_approved_minutes returns.
        if event['EventMinutesFile']:
            e.add_document(
                note='Minutes',
                url=event['EventMinutesFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventMinutesLastPublishedUTC']).date())
        elif web_event['Published minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Published minutes']['label'],
                           url=web_event['Published minutes']['url'],
                           media_type="application/pdf")
        else:
            approved_minutes = self.find_approved_minutes(event)
            if approved_minutes:
                e.add_document(
                    note=approved_minutes['MatterAttachmentName'],
                    url=approved_minutes['MatterAttachmentHyperlink'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        approved_minutes['MatterAttachmentLastModifiedUtc']
                    ).date())

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain
                # the location of the audio file. Skip this audio link;
                # a later scrape can pick it up.
                continue

            # Sometimes if there is an issue getting the Spanish
            # audio created, Metro has the Spanish Audio link
            # go to the English Audio.
            #
            # Pupa does not allow for duplicate media links,
            # so we'll ignore the second media link if it's
            # the same as the first media link.
            #
            # Because of the way that the event['audio'] is created
            # the first audio link is always English and the
            # second is always Spanish
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html',
                             on_duplicate='ignore')

        # .get(): the membership test above shows the key can be absent,
        # in which case the generic calendar page is the web source.
        event_details = event.get('event_details')
        if event_details:
            for link in event_details:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=None):
    """Yield one ``Event`` per merged (API event, web event) pair.

    Args:
        window: optional number of days to look back. When truthy,
            ``self.events`` is called with ``utcnow() - window`` days;
            otherwise it is called with ``None``.

    Yields:
        Event: the populated event.

    Raises:
        ValueError: if two agenda items of one event share the same
            ``EventItemAgendaSequence``.
    """
    if window:
        n_days_ago = (datetime.datetime.utcnow()
                      - datetime.timedelta(float(window)))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only: a meeting name that itself
            # contains a hyphen must not break the two-way unpacking.
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-', 1)]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final",
        # "Final Revised", and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]

        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        legistar_api_url = (self.BASE_URL
                            + '/events/{0}'.format(event['EventId']))
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given
                    # in the agenda minutes.
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the
            # EventItemAgendaSequence, resulting in data inaccuracies.
            # The scrape should fail in such cases, until Metro cleans
            # the data.
            item_agenda_sequences = [
                entry['extras']['item_agenda_sequence']
                for entry in e.agenda
            ]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                # Adjacent literals instead of backslash continuation, so
                # the source indentation does not leak into the message.
                error_msg = (
                    'An agenda has duplicate agenda items on the Legistar '
                    'grid: {event_name} on {event_date} '
                    '({legistar_api_url}). '
                    'Contact Metro, and ask them to remove the duplicate '
                    'EventItemAgendaSequence.')

                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(
                self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain
                # the location of the audio file. Skip this audio link;
                # a later scrape can pick it up.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        # .get(): the membership test above shows the key can be absent,
        # in which case the generic calendar page is the web source.
        event_details = event.get('event_details')
        if event_details:
            for link in event_details:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=3):
    """Yield an ``Event`` for each (API event, web event) pair.

    The API event's comment is handed to ``self._parse_comment``, which
    returns a description, a room, a status and an invalid-event flag;
    flagged events are skipped.

    Args:
        window: number of days to look back; ``self.events`` is called
            with ``utcnow() - window`` days.

    Yields:
        Event: the event with its video link, documents, organizing
        body, agenda items, attendees recorded as 'Present' in the roll
        calls, and sources.
    """
    n_days_ago = (datetime.datetime.utcnow()
                  - datetime.timedelta(float(window)))

    for api_event, web_event in self.events(n_days_ago):
        start = api_event['start']
        place = api_event[u'EventLocation']

        description, room, status, invalid_event = self._parse_comment(
            api_event[u'EventComment'])

        if invalid_event:
            continue

        if room:
            place = room + ', ' + place

        # Fall back to the API's status when the comment gave none.
        status = status or api_event['status']

        event_kwargs = dict(name=api_event["EventBodyName"],
                            start_date=start,
                            location_name=place,
                            status=status)
        # Only pass a description when the comment produced one.
        if description:
            event_kwargs['description'] = description

        e = Event(**event_kwargs)
        e.pupa_id = str(api_event['EventId'])

        video = web_event['Meeting video']
        if video != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=video['url'],
                             type="recording",
                             media_type='text/html')

        for doc_label in ('Published agenda', 'Notice', 'Published summary'):
            self.addDocs(e, web_event, doc_label)
        # Captions are attached only when the web event has that key.
        if 'Captions' in web_event:
            self.addDocs(e, web_event, 'Captions')

        organizer = api_event["EventBodyName"]
        e.add_participant(
            name='Seattle City Council' if organizer == 'City Council'
            else organizer,
            type="organization")

        for entry in self.agenda(api_event):
            agenda_item = e.add_agenda_item(entry["EventItemTitle"])
            matter_file = entry["EventItemMatterFile"]
            if matter_file:
                agenda_item.add_bill(matter_file)

        # De-duplicate attendees across roll calls.
        attendees = {call['RollCallPersonName']
                     for call in self.rollcalls(api_event)
                     if call['RollCallValueName'] == 'Present'}

        for attendee in attendees:
            e.add_participant(name=attendee, type="person")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')
        e.add_source(web_event['Meeting Name']['url'], note='web')

        yield e
def scrape(self, window=30):
    """Yield an ``Event`` for each (API event, web event) pair.

    Sets ``self.retry_wait_seconds`` to 20 before fetching. Events with
    no location are skipped.

    Args:
        window: number of days to look back; ``self.events`` is called
            with ``utcnow() - window`` days.

    Yields:
        Event: the event with its recording link, published agenda and
        minutes, organizing body, agenda items (with per-item video
        links when available), attendees recorded as 'Present' in the
        roll calls, and sources.
    """
    n_days_ago = (datetime.datetime.utcnow()
                  - datetime.timedelta(float(window)))
    self.retry_wait_seconds = 20

    for api_event, event in self.events(n_days_ago):
        description = api_event["EventComment"]
        when = api_event["start"]
        location = api_event["EventLocation"]

        if location == "Council Chambers":
            location = ("Council Chambers, 5th Floor, City-County Building, "
                        "414 Grant Street, Pittsburgh, PA 15219")

        if not location:
            continue

        # The API status is used as-is. The earlier phrase matching ran
        # against a single character of this string (index 1), so none of
        # its multi-character phrases could ever match.
        status = api_event["status"]

        if event["Meeting Name"] == "Post Agenda":
            event_name = "Agenda Announcement"
        elif event["Meeting Name"] == "City Council":
            event_name = "Regular meeting"
        else:
            event_name = event["Meeting Name"]

        if description:
            e = Event(name=event_name,
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event_name,
                      start_date=when,
                      location_name=location,
                      status=status)

        e.pupa_id = str(api_event["EventId"])

        # Resolve the meeting video link once; the per-item links below
        # are built from it instead of re-fetching it for every item.
        video_url = None
        meeting_video = event["Meeting video"]
        if meeting_video != "Not\xa0available" and "url" in meeting_video:
            video_url = self.get_meeting_video_link(meeting_video["url"])
            e.add_media_link(note="Recording",
                             url=video_url,
                             type="recording",
                             media_type="text/html")

        self.addDocs(e, event, "Published agenda")
        self.addDocs(e, event, "Published minutes")

        participant = event["Meeting Name"]
        if participant == "City Council" or participant == "Post Agenda":
            participant = "Pittsburgh City Council"

        e.add_participant(name=participant, type="organization")

        for item in self.agenda(api_event):
            clean_title = self.clean_agenda_item_title(item["EventItemTitle"])
            agenda_item = e.add_agenda_item(clean_title)
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

            # Skip the item link when the meeting has no usable video URL.
            if item["EventItemVideo"] and video_url is not None:
                item_video_url = (video_url
                                  + '?view_id=2&meta_id='
                                  + str(item["EventItemVideo"]))
                agenda_item.add_media_link(note="Recording",
                                           url=item_video_url,
                                           type="recording",
                                           media_type="text/html")

        # De-duplicate attendees across roll calls.
        participants = set()
        for call in self.rollcalls(api_event):
            if call["RollCallValueName"] == "Present":
                participants.add(call["RollCallPersonName"])

        for person in participants:
            e.add_participant(name=person, type="person")

        e.add_source(self.BASE_URL + "/events/{EventId}".format(**api_event),
                     note="api")

        try:
            detail_url = event["Meeting Details"]["url"]
        except TypeError:
            # 'Meeting Details' is not subscriptable by key here; fall
            # back to the general events page.
            e.add_source(self.EVENTSPAGE, note="web")
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note="web")

        yield e
def scrape(self, window=None):
    """Yield one ``Event`` per merged (API event, web event) pair.

    Args:
        window: optional number of days to look back. When truthy,
            ``self.events`` is called with ``utcnow() - window`` days;
            otherwise it is called with ``None``.

    Yields:
        Event: the populated event.

    Raises:
        ValueError: if two agenda items of one event share the same
            ``EventItemAgendaSequence``.
    """
    if window:
        n_days_ago = (datetime.datetime.utcnow()
                      - datetime.timedelta(float(window)))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only: a meeting name that itself
            # contains a hyphen must not break the two-way unpacking.
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-', 1)]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final",
        # "Final Revised", and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]

        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given
                    # in the agenda minutes.
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the
            # EventItemAgendaSequence, resulting in data inaccuracies.
            # The scrape should fail in such cases, until Metro cleans
            # the data.
            item_agenda_sequences = [
                entry['extras']['item_agenda_sequence']
                for entry in e.agenda
            ]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                # Adjacent literals instead of backslash continuation, so
                # the source indentation does not leak into the message.
                error_msg = (
                    'An agenda has duplicate agenda items on the Legistar '
                    'grid: {event_name} on {event_date} '
                    '({legistar_api_url}). '
                    'Contact Metro, and ask them to remove the duplicate '
                    'EventItemAgendaSequence.')

                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(
                self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain
                # the location of the audio file. Skip this audio link;
                # a later scrape can pick it up.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        # .get(): the membership test above shows the key can be absent,
        # in which case the generic calendar page is the web source.
        event_details = event.get('event_details')
        if event_details:
            for link in event_details:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=3):
    """Yield an ``Event`` for each (API event, web event) pair.

    The web event's 'Meeting Location' text is split on '--' to obtain
    the location, and whatever follows 'Chicago, Illinois' in the last
    piece is read as a free-text notice that may cancel the meeting,
    change the room, mark the entry as invalid, or become the event's
    description.

    Args:
        window: number of days to look back; ``self.events`` is called
            with ``utcnow() - window`` days.

    Yields:
        Event: the populated event. Entries with an empty location or a
        'wrong meeting date' notice are skipped.
    """
    n_days_ago = (datetime.datetime.utcnow()
                  - datetime.timedelta(float(window)))

    for api_event, event in self.events(n_days_ago):
        description = None

        when = api_event['start']
        location_string = event[u'Meeting Location']

        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        # Start from the API's status so that every branch below leaves
        # `status` defined (the room-change branch only edits location).
        status = api_event['status']

        status_string = location_list[-1].split('Chicago, Illinois')
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if any(phrase in status_text
                   for phrase in ('rescheduled to',
                                  'postponed to',
                                  'reconvened to',
                                  'meeting recessed',
                                  'recessed meeting',
                                  'recessed until',
                                  'deferred',
                                  'time change',
                                  'date change',
                                  'recessed meeting - reconvene',
                                  'cancelled',
                                  'new date and time',
                                  'rescheduled indefinitely',
                                  'rescheduled for',)):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in ('meeting reconvened',
                                 'reconvened meeting',
                                 'recessed meeting',
                                 'reconvene meeting',
                                 'rescheduled hearing',
                                 'rescheduled meeting',
                                 'amended notice of meeting',
                                 'room change',
                                 'amended notice',
                                 'change of location',
                                 'revised - meeting date and time'):
                # Informational notice only: keep the API's status.
                pass
            elif 'room' in status_text:
                # Room information: prepend it to the location.
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # Unrecognised notice: surface it and keep it as the
                # event description.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()

        if description:
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      location_name=location,
                      status=status)

        e.pupa_id = str(api_event['EventId'])

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Captions')
        self.addDocs(e, event, 'Summary')

        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'

        e.add_participant(name=participant, type="organization")

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        # De-duplicate attendees across roll calls.
        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'])

        for person in participants:
            e.add_participant(name=person, type="person")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' is not subscriptable by key here; fall
            # back to the general events page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e