def scrape_events_range(self, start_date, end_date):
    """Yield an Event for each Toronto committee meeting between start_date
    (inclusive) and end_date (exclusive).

    For each day in the range, the calendar-day page is fetched and every
    listed meeting becomes an Event.  When the meeting's agenda (or minutes)
    has been published, agenda items and their bill identifiers are attached.
    """

    def daterange(start_date, end_date):
        # Yields each date from start_date up to, but not including, end_date.
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        # NOTE(review): the URL template is given a zero-based month
        # (date.month - 1) — presumably the calendar endpoint expects
        # JavaScript-style months; confirm against the endpoint.
        calendar_day_url = CALENDAR_DAY_TEMPLATE.format(
            date.year, date.month - 1, date.day)
        events = self.extract_events_by_url(calendar_day_url)
        for event in events:
            # Meeting times on the page are local to Toronto.
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(
                date.replace(hour=time.hour,
                             minute=time.minute,
                             second=0,
                             microsecond=0))
            org_name = event['meeting']
            e = Event(name=org_name,
                      start_time=start,
                      timezone=tz.zone,
                      location_name=event['location'],
                      status=STATUS_DICT.get(event['meeting_status']))
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_source(calendar_day_url)
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                # An agenda exists once it (or the minutes) has been published.
                return event['publishing_status'] in [
                    'Agenda Published', 'Minutes Published'
                ]

            def is_council(event):
                # Full-council meetings are named after the jurisdiction itself.
                return True if event[
                    'meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                # Council agendas live at a different URL than committee agendas.
                agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(
                    event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = agenda_url_template.format(event['meeting_id'])
                full_identifiers = list(
                    self.full_identifiers(event['meeting_id'],
                                          is_council(event)))
                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        # NOTE(review): defined but never called in this
                        # version of the scraper — likely vestigial.
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    # Match the item's short identifier (e.g. "EX1.2") against
                    # the year-prefixed full identifiers (e.g. "2017.EX1.2").
                    identifier_regex = re.compile(
                        r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    # Exactly one full identifier must match; otherwise this
                    # single-element unpacking raises ValueError.
                    [full_identifier] = [
                        id for id in full_identifiers
                        if identifier_regex.match(id).group(1) ==
                        item['identifier']
                    ]
                    a.add_bill(full_identifier)
            yield e
def scrape(self, window=None):
    """Yield an Event for each LA Metro meeting.

    window: look back this many days (accepts a float); None scrapes all
    available events.  API events are merged with their legistar.com web
    counterparts so web-only artifacts (recap/minutes) can be attached.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]
        # "Board of Directors - X" style names carry the event name after
        # the dash.
        # NOTE(review): split('-') has no maxsplit — a second dash in the
        # name would break this 2-tuple unpacking; confirm the data never
        # contains one.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip() for part in body_name.split('-')]
        else:
            event_name = body_name

        # Map Legistar agenda statuses onto OCD event statuses.
        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in the agenda minutes
                note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                     note='api')
        if event.get('SAPEventId'):
            # Spanish-language (SAP) twin of the event, when present.
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")
        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        # Recap/minutes documents only exist on the web calendar, not the API.
        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        yield e
def scrape(self, window=3):
    """Yield an Event for each NYC Council meeting in the last *window* days.

    Pairs the Legistar API event (api_event) with its scraped web row
    (event); attendance is derived from roll calls, and joint/committee
    participants are parsed out of the meeting-location text.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))
    for api_event, event in self.events(n_days_ago):
        when = api_event['start']
        location = api_event['EventLocation']

        # Placeholder topics are not useful as descriptions; drop them.
        description = event['Meeting\xa0Topic']
        if any(each in description
               for each in ('Multiple meeting items', 'AGENDA TO BE ANNOUNCED')):
            description = None

        # Event is constructed without a description kwarg when none exists.
        if description:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=api_event['status'])
        else:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      location_name=location,
                      status=api_event['status'])

        e.pupa_id = str(api_event['EventId'])

        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')

        location_string = event[u'Meeting Location']
        location_notes, other_orgs = self._parse_location(location_string)
        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}

        # Determine which organizations take part in this meeting.
        if e.name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in e.name.lower():
            participating_orgs = [e.name]
        else:
            participating_orgs = []

        if other_orgs:
            # Text like "Jointly with the X and the Y" lists co-hosting bodies.
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        # Only members recorded as 'Present' become person participants.
        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'].strip())
        for person in participants:
            e.add_participant(name=person, type="person")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')

        # 'Meeting Details' may not be a dict (no detail page); indexing it
        # then raises TypeError and we fall back to the events listing page.
        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            # Only cite the detail page if it actually resolves.
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e
def scrape(self, window=None):
    """Yield an Event for each LA Metro meeting.

    window: look back this many days (accepts a float); None scrapes all
    available events.  Each API event is paired with its row from the
    legistar.com web calendar (web_event), which supplies audio, recap
    documents, and the meeting-details link.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None

    for event, web_event in self.events(n_days_ago):
        body_name = event["EventBodyName"]
        # "Board of Directors - X" style names carry the event name after
        # the dash.
        # NOTE(review): split('-') has no maxsplit — a second dash would
        # break this 2-tuple unpacking; confirm the data never has one.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        else:
            event_name = body_name

        # Map Legistar agenda statuses onto OCD event statuses.
        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=event["EventLocation"],
                  status=status)
        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links
        e.extras = {'guid': event['EventGuid']}

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in the agenda minutes
                note = "Agenda number, {}".format(
                    item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")
        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
        if web_event['Audio'] != 'Not\xa0available':
            try:
                redirect_url = self.head(
                    web_event['Audio']['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain the
                # location of the audio file. Skip these events, and retry
                # on next scrape.
                continue
            e.add_media_link(note=web_event['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        # Cite the meeting-detail page when it exists and resolves; otherwise
        # fall back to the calendar listing.
        if web_event['Meeting Details'] != 'Meeting\xa0details':
            if requests.head(web_event['Meeting Details']
                             ['url']).status_code == 200:
                e.add_source(web_event['Meeting Details']['url'],
                             note='web')
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=None):
    """Yield an Event for each LA Metro meeting, excluding service councils.

    window: look back this many days (accepts a float); None scrapes all
    available events.  API events are merged with their legistar.com web
    counterparts for eComment links, published minutes, and audio.  The
    scrape raises ValueError if an agenda has duplicate line numbers, so
    bad upstream data fails loudly instead of being ingested.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None

    events = self.events(since_datetime=n_days_ago)

    # Body types 70 and 75 are the service-council bodies to skip.
    service_councils = set(sc['BodyId'] for sc in self.search(
        '/bodies/', 'BodyId', 'BodyTypeId eq 70 or BodyTypeId eq 75'))

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]
        # "Board of Directors - X" style names carry the event name after
        # the dash.
        # NOTE(review): split('-') has no maxsplit — a second dash would
        # break this 2-tuple unpacking; confirm the data never has one.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        elif event['EventBodyId'] in service_councils:
            # Don't scrape service council or service council public hearing events.
            self.info('Skipping event {0} for {1}'.format(
                event['EventId'], event['EventBodyName']))
            continue
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if web_event.has_ecomment:
            self.info('Adding eComment link {0} from {1}'.format(
                web_event['eComment'],
                web_event['Meeting Details']['url']))
            e.extras['ecomment'] = web_event['eComment']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    agenda_number = item["EventItemAgendaNumber"]
                    note = "Agenda number, {}".format(agenda_number)
                    agenda_item['notes'].append(note)

                    agenda_item['extras']['agenda_number'] = agenda_number

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(
                    set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            # Spanish-language (SAP) twin of the event, when present.
            e.add_source(self.BASE_URL +
                         '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(
                note='Agenda',
                url=event['EventAgendaFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventAgendaLastPublishedUTC']).date())

        # Prefer API minutes, then web-published minutes, then board-approved
        # minutes attached to a matter.
        if event['EventMinutesFile']:
            e.add_document(
                note='Minutes',
                url=event['EventMinutesFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventMinutesLastPublishedUTC']).date())
        elif web_event['Published minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Published minutes']['label'],
                           url=web_event['Published minutes']['url'],
                           media_type="application/pdf")
        else:
            approved_minutes = self.find_approved_minutes(event)
            if approved_minutes:
                e.add_document(
                    note=approved_minutes['MatterAttachmentName'],
                    url=approved_minutes['MatterAttachmentHyperlink'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        approved_minutes['MatterAttachmentLastModifiedUtc']
                    ).date())

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            # Sometimes if there is an issue getting the Spanish
            # audio created, Metro has the Spanish Audio link
            # go to the English Audio.
            #
            # Pupa does not allow duplicate media links, so we
            # ignore the second media link if it is the same as
            # the first media link.
            #
            # Because of the way that event['audio'] is created,
            # the first audio link is always English and the
            # second is always Spanish.
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html',
                             on_duplicate='ignore')

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape_events_range(self, start_date, end_date):
    """Yield an Event for each Toronto committee meeting between start_date
    (inclusive) and end_date (exclusive).

    For each day in the range, the calendar-day page is fetched and every
    listed meeting becomes an Event.  When the meeting's agenda (or minutes)
    has been published, agenda items and their bill identifiers are attached.
    """

    def daterange(start_date, end_date):
        # Yields each date from start_date up to, but not including, end_date.
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + datetime.timedelta(n)

    for date in daterange(start_date, end_date):
        # NOTE(review): the URL template is given a zero-based month
        # (date.month - 1) — presumably the calendar endpoint expects
        # JavaScript-style months; confirm against the endpoint.
        calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
        events = self.extract_events_by_url(calendar_day_url)
        for event in events:
            # Meeting times on the page are local to Toronto.
            tz = pytz.timezone("America/Toronto")
            time = datetime.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_source(calendar_day_url)
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                # An agenda exists once it (or the minutes) has been published.
                return event['publishing_status'] in ['Agenda Published', 'Minutes Published']

            def is_council(event):
                # Full-council meetings are named after the jurisdiction itself.
                return True if event['meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                # Council agendas live at a different URL than committee agendas.
                agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = agenda_url_template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'], is_council(event)))
                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        # NOTE(review): defined but never called in this
                        # version of the scraper — likely vestigial.
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    # Match the item's short identifier (e.g. "EX1.2") against
                    # the year-prefixed full identifiers (e.g. "2017.EX1.2").
                    # Exactly one must match; otherwise the single-element
                    # unpacking raises ValueError.
                    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    [full_identifier] = [id for id in full_identifiers
                                         if identifier_regex.match(id).group(1) == item['identifier']]
                    a.add_bill(full_identifier)
            yield e
def scrape(self):
    """Yield an Event for each NYC Council meeting since 2017.

    The listing only shows a date; the precise start time is pulled from the
    per-event iCalendar feed.  A small deque of recently seen (name, time)
    pairs suppresses duplicate rows in the listing.
    """
    # Listing pages can repeat an event; remember the last few to dedupe.
    last_events = deque(maxlen=10)

    for event, agenda in self.events(since=2017):
        other_orgs = ''
        extras = []

        # "--em--" separates the room from free-form notes; notes starting
        # with "Join" name co-hosting organizations.
        if '--em--' in event[u'Meeting Location']:
            location_string, note = event[u'Meeting Location'].split(
                '--em--')[:2]
            for each in note.split(' - '):
                if each.startswith('Join'):
                    other_orgs = each
                else:
                    extras.append(each)
        else:
            location_string = event[u'Meeting Location']

        location_list = location_string.split('-', 2)
        location = ', '.join([each.strip() for each in location_list[0:2]])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])

        # Fetch the iCalendar feed for the precise start time.
        # NOTE(review): verify=False disables TLS certificate checks —
        # presumably a workaround for a bad certificate; confirm it is
        # still needed.
        response = self.get(event['iCalendar']['url'], verify=False)
        event_time = self.ical(
            response.text).subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour, minute=event_time.minute)

        # Derive status: deferred meetings are cancelled; otherwise compare
        # against the current time.
        time_string = event['Meeting Time']
        if time_string in ('Deferred', ):
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'

        # Placeholder topics are not useful as descriptions; drop them.
        description = event['Meeting\xa0Topic']
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = ''

        event_name = event['Name']

        # Skip rows we have already emitted for the same (name, time).
        event_id = (event_name, when)
        if event_id in last_events:
            continue
        else:
            last_events.append(event_id)

        e = Event(name=event_name,
                  start_date=when,
                  description=description,
                  location_name=location,
                  status=status)

        if extras:
            e.extras = {'location note': ' '.join(extras)}

        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')

        # Determine which organizations take part in this meeting.
        if event['Name'] == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event['Name'].lower():
            participating_orgs = [event["Name"]]
        else:
            participating_orgs = []

        if other_orgs:
            # Text like "Jointly with the X and the Y" lists co-hosts.
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        if agenda:
            e.add_source(event["Meeting Details"]['url'], note='web')

            for item, _, _ in agenda:
                if item["Name"]:
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        # Use the recorded action as the bill-relation note,
                        # defaulting to plain consideration.
                        if item['Action']:
                            note = item['Action']
                        else:
                            note = 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def scrape(self, window=None):
    """Yield an Event for each LA Metro meeting.

    window: look back this many days (accepts a float); None scrapes all
    available events.  API events are merged with their legistar.com web
    counterparts.  The scrape raises ValueError when an agenda has
    duplicate Legistar line numbers, so bad upstream data fails loudly.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]
        # "Board of Directors - X" style names carry the event name after
        # the dash.
        # NOTE(review): split('-') has no maxsplit — a second dash would
        # break this 2-tuple unpacking; confirm the data never has one.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip() for part in body_name.split('-')]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                raise ValueError(error_msg.format(
                    event_name=e.name,
                    event_date=e.start_date.strftime("%B %d, %Y"),
                    legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            # Spanish-language (SAP) twin of the event, when present.
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")
        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        # Recap/minutes documents only exist on the web calendar, not the API.
        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        yield e
def scrape_events_range(self, start_date, end_date):
    """Yield Events (and first-seen Bills) for Toronto committee meetings
    between start_date (inclusive) and end_date (exclusive).

    Besides emitting an Event per meeting, each agenda item whose full
    identifier has not been seen before (tracked in self.seen_agenda_items)
    is also emitted as a Bill.
    """

    def daterange(start_date, end_date):
        # Yields each date from start_date up to, but not including, end_date.
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            # Meeting times on the page are local to Toronto.
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
            # Cite the calendar-day page the event was found on.
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month, start.day)
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                # An agenda exists once it (or the minutes) has been published.
                return event['publishing_status'] in ['Agenda Published', 'Minutes Published']

            def is_council(event):
                # Full-council meetings are named after the jurisdiction itself.
                return True if event['meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                # Council agendas live at a different URL than committee agendas.
                template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'], is_council(event)))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        # Empty ward text means the item applies to all wards;
                        # otherwise the comma-separated list is split.
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    wards = normalize_wards(item['wards'])

                    # Match the item's short identifier (e.g. "EX1.2") against
                    # the year-prefixed full identifiers (e.g. "2017.EX1.2").
                    # Exactly one must match; otherwise the single-element
                    # unpacking raises ValueError.
                    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    [full_identifier] = [id for id in full_identifiers
                                         if identifier_regex.match(id).group(1) == item['identifier']]
                    a.add_bill(full_identifier)

                    # Emit a Bill only the first time this identifier appears.
                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session='2014-2018',
                            identifier=full_identifier,
                            title=item['title'],
                            from_organization={'name': self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(note='canonical',
                                            media_type='text/html',
                                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                        b.extras = {
                            'wards': wards,
                        }
                        self.seen_agenda_items.append(full_identifier)

                        yield b
            yield e
def lower_parse_page(self, url):
    """Scrape the public-hearing calendar page at *url* and yield an Event
    for each hearing listed.

    Each hearing is a table of class ``pubhrgtbl`` whose rows carry the
    hearing date, the hosting committee(s), and key/value metadata (time,
    place, contacts, title).  Hearings with a "TBD" date are skipped;
    a date that cannot be parsed at all aborts the scrape (see below).

    Raises AssertionError when a hearing's title row cannot be identified.
    """
    page = self.lxmlize(url)

    tables = page.xpath("//table[@class='pubhrgtbl']")
    date = None
    for table in tables:
        metainf = {}
        # Reset per table: previously `coms` was only assigned when a
        # committee-title cell was present, so it was unbound for a first
        # table without one and *stale* (the prior table's committees)
        # for later ones.
        coms = []

        rows = table.xpath(".//tr")
        for row in rows:
            tds = row.xpath("./*")
            if len(tds) < 2:
                continue
            key, value = tds

            if key.tag == "th" and key.get("class") == "hrgdate":
                date = key.text_content()
                date = re.sub(r"\s+", " ", date)
                date = re.sub(".*POSTPONED NEW DATE", "", date).strip()

            # Due to the html structure this shouldn't be an elif
            # It needs to fire twice in the same loop iteration
            if value.tag == "th" and value.get("class") == "commtitle":
                coms = value.xpath('.//div[contains(@class,"comm-txt")]/text()')
            elif key.tag == "td":
                key = key.text_content().strip()
                value = value.text_content().strip()
                # Normalize stray 0x96 characters (presumably a cp1252
                # en dash that survived decoding) to a plain hyphen.
                value = value.replace(u"\x96", "-")
                value = re.sub(r"\s+", " ", value)
                metainf[key] = value

        time = metainf["Time:"]
        repl = {"A.M.": "AM", "P.M.": "PM"}
        drepl = {"Sept": "Sep"}
        for r in repl:
            time = time.replace(r, repl[r])
        for r in drepl:
            date = date.replace(r, drepl[r])

        # Keep only the start of a "10:00 A.M. - 12:00 P.M." style range.
        time = re.sub("-.*", "", time)
        time = time.strip()

        # The page omits the year, so assume the current one.
        year = dt.datetime.now().year

        date = "%s %s %s" % (date, year, time)

        if "tbd" in date.lower():
            continue

        date = date.replace(" PLEASE NOTE NEW TIME", "")

        # Check if the event has been postponed.
        postponed = "POSTPONED" in date
        if postponed:
            date = date.replace(" POSTPONED", "")

        date_formats = ["%B %d %Y %I:%M %p", "%b. %d %Y %I:%M %p"]
        # Renamed from `datetime` to avoid shadowing the common module name.
        parsed = None
        for fmt in date_formats:
            try:
                parsed = dt.datetime.strptime(date, fmt)
                break
            except ValueError:
                pass

        # If the datetime can't be parsed, bail.
        # NOTE(review): this abandons all remaining tables, not just this
        # one — behavior preserved as-is, but `continue` may have been the
        # intent; confirm.
        if parsed is None:
            return

        title_key = set(metainf) & {
            "Public Hearing:",
            "Summit:",
            "Roundtable:",
            "Public Roundtable:",
            "Public Meeting:",
            "Public Forum:",
            "Meeting:",
        }
        assert len(title_key) == 1, "Couldn't determine event title."
        title_key = list(title_key).pop()
        title = metainf[title_key]

        title = re.sub(
            r"\*\*Click here to view public hearing notice\*\*", "", title
        )

        # If event was postponed, add a warning to the title.
        if postponed:
            title = "POSTPONED: %s" % title

        event = Event(
            name=title,
            start_date=self._tz.localize(parsed),
            location_name=metainf["Place:"],
        )
        event.extras = {"contact": metainf["Contact:"]}
        if "Media Contact:" in metainf:
            event.extras.update(media_contact=metainf["Media Contact:"])
        event.add_source(url)

        for com in coms:
            event.add_participant(com.strip(), type="committee", note="host")
            participant = event.participants[-1]
            participant["extras"] = ({"chamber": self.classify_committee(com)},)

        yield event
def scrape_events_range(self, start_date, end_date):
    """Yield Events (and Bills for newly introduced items) for Toronto
    committee meetings between start_date (inclusive) and end_date
    (exclusive).

    Agenda items of type ACTION are linked as bills; an item is also
    emitted as a Bill when its two-letter committee code shows it
    originated in the committee currently meeting.
    """

    def daterange(start_date, end_date):
        # Yields each date from start_date up to, but not including, end_date.
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            # Meeting times on the page are local to Toronto.
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(
                date.replace(hour=time.hour,
                             minute=time.minute,
                             second=0,
                             microsecond=0))
            # Cite the calendar-day page the event was found on.
            source_url = CALENDAR_DAY_TEMPLATE.format(
                start.year, start.month, start.day)
            org_name = event['meeting']
            e = Event(name=org_name,
                      start_time=start,
                      timezone=tz.zone,
                      location_name=event['location'],
                      status=STATUS_DICT.get(event['meeting_status']))
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                # An agenda exists once it (or the minutes) has been published.
                return event['publishing_status'] in [
                    'Agenda Published', 'Minutes Published'
                ]

            def is_council(event):
                # Full-council meetings are named after the jurisdiction itself.
                return True if event[
                    'meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                # Council agendas live at a different URL than committee agendas.
                template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(
                    event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(
                    self.full_identifiers(event['meeting_id'],
                                          is_council(event)))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def is_vote_event(item):
                        # Only ACTION items carry votes/bills.
                        return True if item['type'] == 'ACTION' else False

                    def normalize_wards(raw):
                        # Empty ward text means the item applies to all wards;
                        # otherwise the comma-separated list is split.
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    def is_being_introduced(item, event):
                        # An item is introduced here when the committee code
                        # embedded in its identifier matches the committee
                        # currently meeting.
                        org_name = event['meeting']
                        identifier = item['identifier']
                        # `org_code` is two-letter code for committee
                        current_org_code = self.committees_by_name.get(
                            org_name)[0]['code']
                        originating_org_code = re.search(
                            r'([A-Z]{2})[0-9]+\.[0-9]+',
                            identifier).group(1)
                        return current_org_code == originating_org_code

                    if is_vote_event(item):
                        wards = normalize_wards(item['wards'])

                        # Match the item's short identifier (e.g. "EX1.2")
                        # against the year-prefixed full identifiers
                        # (e.g. "2017.EX1.2").  Exactly one must match;
                        # otherwise the unpacking raises ValueError.
                        identifier_regex = re.compile(
                            r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                        [full_identifier] = [
                            id for id in full_identifiers
                            if identifier_regex.match(id).group(1) ==
                            item['identifier']
                        ]
                        a.add_bill(full_identifier)

                        if is_being_introduced(item, event):
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session='2014-2018',
                                identifier=full_identifier,
                                title=item['title'],
                                from_organization={'name': org_name},
                            )
                            b.add_source(agenda_url)
                            b.add_document_link(
                                note='canonical',
                                media_type='text/html',
                                url=AGENDA_ITEM_TEMPLATE.format(
                                    full_identifier))
                            b.extras = {
                                'wards': wards,
                            }

                            yield b
            yield e
def scrape(self):
    """Yield an Event for each NYC Council meeting listing.

    Pulls (event, agenda) pairs from ``self.events(since=2011)``, parses
    location/time/status out of the scraped fields, and attaches media,
    documents, participating committees, and agenda items.
    """
    # Dedupe window: remember the last 10 (name, start) pairs seen, since
    # the listing can repeat the same meeting.
    last_events = deque(maxlen=10)
    for event, agenda in self.events(since=2011):
        other_orgs = ''
        extras = []
        # '--em--' separates the location proper from trailing notes.
        # A note fragment starting with 'Join' names co-hosting orgs;
        # everything else is kept as a free-form location note.
        if '--em--' in event[u'Meeting Location']:
            location_string, note = event[u'Meeting Location'].split('--em--')[:2]
            for each in note.split(' - '):
                if each.startswith('Join'):
                    other_orgs = each
                else:
                    extras.append(each)
        else:
            location_string = event[u'Meeting Location']
        # Keep at most the first two '-'-separated location segments,
        # joined as "room, building".
        location_list = location_string.split('-', 2)
        location = ', '.join([each.strip() for each in location_list[0:2]])
        if not location:
            continue
        # Date comes from the listing; the time of day comes from the
        # attached iCalendar component's DTSTART.
        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)
        time_string = event['Meeting Time']
        if time_string in ('Deferred',):
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'
        # NOTE: the scraped column header contains a non-breaking space
        # (\xa0), hence the odd key.
        description = event['Meeting\xa0Topic']
        # Placeholder topics carry no information; blank them out.
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = ''
        event_name = event['Name']
        # Skip events we have just emitted (same name and start time).
        event_id = (event_name, when)
        if event_id in last_events:
            continue
        else:
            last_events.append(event_id)
        e = Event(name=event_name,
                  start_time=when,
                  timezone=self.TIMEZONE,
                  description=description,
                  location_name=location,
                  status=status)
        if extras:
            e.extras = {'location note': ' '.join(extras)}
        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')
        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')
        if event['Name'] == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event['Name'].lower():
            participating_orgs = [event["Name"]]
        else:
            participating_orgs = []
        if other_orgs:
            # Strip the lead-in ('Jointl*y' also tolerates the site's
            # 'Jointy' typo) and split the remaining committee names.
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)
        for org in participating_orgs:
            e.add_committee(name=org)
        if agenda:
            e.add_source(event["Meeting Details"]['url'])
            for item, _, _ in agenda:
                if item["Name"]:
                    agenda_item = e.add_agenda_item(item["Name"])
                    # "File\xa0#" (non-breaking space) links the agenda
                    # item to a bill; note is the recorded action if any.
                    if item["File\xa0#"]:
                        if item['Action']:
                            note = item['Action']
                        else:
                            note = 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)
        else:
            e.add_source(self.EVENTSPAGE)
        yield e
def scrape(self, window=None):
    """Yield an Event for each LA Metro meeting from the Legistar API,
    merged with its web-calendar counterpart.

    :param window: optional look-back in days; when given, only events
        since ``utcnow() - window`` days are scraped, otherwise all.
    :raises ValueError: when an agenda has duplicate
        ``EventItemAgendaSequence`` values (bad upstream data).
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None
    events = self.events(n_days_ago)
    # Each API event is paired with its scraped web-calendar row.
    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]
        # 'Board of Directors - Regular Board Meeting' style names split
        # into the body and the event name.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)
                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)
                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']
            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(
                    set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'
                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        # Spanish-language twin event, when present.
        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(
                event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Audio links are published behind a redirect; resolve it now.
        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        # Web calendar uses a non-breaking space (\xa0) in its
        # placeholder text.
        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def lower_parse_page(self, url):
    """Yield an Event for each public-hearing table on a lower-chamber
    hearing-notice page.

    Each ``pubhrgtbl`` table is parsed into a key/value dict plus the list
    of hosting committees; the date/time strings are normalized and parsed,
    postponements are flagged in the title, and hosting committees are
    attached as participants.

    :param url: hearing-listing page to parse
    """
    page = self.lxmlize(url)
    tables = page.xpath("//table[@class='pubhrgtbl']")
    date = None
    for table in tables:
        metainf = {}
        # Fix: reset the committee list per table. Previously `coms` was
        # only assigned when a 'commtitle' row was found, so a table
        # without one raised NameError (or reused the previous table's
        # committees).
        coms = []
        rows = table.xpath(".//tr")
        for row in rows:
            tds = row.xpath("./*")
            if len(tds) < 2:
                continue
            key, value = tds
            if key.tag == 'th' and key.get("class") == 'hrgdate':
                date = key.text_content()
                date = re.sub(r"\s+", " ", date)
                date = re.sub(".*POSTPONED NEW DATE", "", date).strip()
            # Due to the html structure this shouldn't be an elif
            # It needs to fire twice in the same loop iteration
            if value.tag == 'th' and value.get("class") == 'commtitle':
                coms = value.xpath(
                    './/div[contains(@class,"comm-txt")]/text()')
            elif key.tag == 'td':
                key = key.text_content().strip()
                value = value.text_content().strip()
                # \x96 is a Windows-1252 en dash; normalize to '-'.
                value = value.replace(u'\x96', '-')
                value = re.sub(r"\s+", " ", value)
                metainf[key] = value

        # Normalize the time ("A.M."/"P.M." -> strptime-friendly) and
        # month abbreviations ("Sept" -> "Sep").
        time = metainf['Time:']
        repl = {
            "A.M.": "AM",
            "P.M.": "PM",
        }
        drepl = {
            "Sept": "Sep"
        }
        for r in repl:
            time = time.replace(r, repl[r])
        for r in drepl:
            date = date.replace(r, drepl[r])
        # Drop the end of a "10:00 AM - 12:00 PM" style range.
        time = re.sub("-.*", "", time)
        time = time.strip()

        # The page omits the year; assume the current one.
        year = dt.datetime.now().year
        date = "%s %s %s" % (date, year, time)

        if "tbd" in date.lower():
            continue

        date = date.replace(' PLEASE NOTE NEW TIME', '')

        # Check if the event has been postponed.
        postponed = 'POSTPONED' in date
        if postponed:
            date = date.replace(' POSTPONED', '')

        date_formats = ["%B %d %Y %I:%M %p", "%b. %d %Y %I:%M %p"]
        # Renamed from `datetime` to avoid shadowing the module name.
        when = None
        for fmt in date_formats:
            try:
                when = dt.datetime.strptime(date, fmt)
            except ValueError:
                pass

        # If the datetime can't be parsed, bail.
        if when is None:
            return

        title_key = set(metainf) & set([
            'Public Hearing:', 'Summit:', 'Roundtable:',
            'Public Roundtable:', 'Public Meeting:', 'Public Forum:',
            'Meeting:'])
        assert len(title_key) == 1, "Couldn't determine event title."
        title_key = list(title_key).pop()
        title = metainf[title_key]

        title = re.sub(
            r"\*\*Click here to view public hearing notice\*\*",
            "",
            title
        )

        # If event was postponed, add a warning to the title.
        if postponed:
            title = 'POSTPONED: %s' % title

        event = Event(
            name=title,
            start_date=self._tz.localize(when),
            location_name=metainf['Place:'],
        )
        event.extras = {'contact': metainf['Contact:']}
        if 'Media Contact:' in metainf:
            event.extras.update(media_contact=metainf['Media Contact:'])
        event.add_source(url)

        for com in coms:
            event.add_participant(
                com.strip(),
                type='committee',
                note='host',
            )
            participant = event.participants[-1]
            # Fix: the original line ended with a trailing comma, which
            # assigned a 1-tuple *containing* the extras dict instead of
            # the dict itself.
            participant['extras'] = {'chamber': self.classify_committee(com)}

        yield event
def scrape_events_range(self, start_date, end_date):
    """Yield an Event for each Toronto meeting in [start_date, end_date),
    plus a Bill for each agenda item not already seen this scrape
    (tracked in ``self.seen_agenda_items``), enriched with ward and
    address data.
    """

    def daterange(start_date, end_date):
        # Yield each calendar day in [start_date, end_date).
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        # NOTE(review): the calendar URL is built with month - 1, i.e. a
        # zero-based month — presumably what the endpoint expects, since
        # the other calendar call sites here do the same; confirm.
        calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
        events = self.extract_events_by_url(calendar_day_url)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event["time"], "%I:%M %p")
            start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
            org_name = event["meeting"]
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event["location"],
                status=STATUS_DICT.get(event["meeting_status"]),
            )
            e.extras = {"meeting_number": event["no"], "tmmis_meeting_id": event["meeting_id"]}
            e.add_source(calendar_day_url)
            e.add_participant(name=org_name, type="organization")

            def is_agenda_available(event):
                # An agenda is scrapeable once it (or the minutes) have
                # been published.
                return event["publishing_status"] in ["Agenda Published", "Minutes Published"]

            def is_council(event):
                # Full-council meetings are named after the jurisdiction.
                return True if event["meeting"] == self.jurisdiction.name else False

            if is_agenda_available(event):
                agenda_url_template = (
                    AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                )
                agenda_url = agenda_url_template.format(event["meeting_id"])
                full_identifiers = list(self.full_identifiers(event["meeting_id"], is_council(event)))
                # Address lookup: maps full agenda-item identifiers to
                # street addresses for this meeting.
                event_map_url_template = (
                    "http://app.toronto.ca/tmmis/getAddressList.do?function=getMeetingAddressList&meetingId={}"
                )
                event_map_url = event_map_url_template.format(event["meeting_id"])
                addresses_d = self.addressesByAgendaId(event_map_url)
                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item["title"])
                    a.add_classification(item["type"].lower())
                    a["order"] = str(i)

                    def normalize_wards(raw):
                        # Empty/None and 'All' normalize to 'all';
                        # otherwise return the list of ward numbers.
                        if not raw:
                            raw = "All"
                        if raw == "All":
                            return raw.lower()
                        else:
                            return raw.split(", ")

                    wards = normalize_wards(item["wards"])
                    identifier_regex = re.compile(r"^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$")
                    # Exactly one full identifier must match the item's
                    # short identifier; the single-element unpack raises
                    # loudly otherwise.
                    [full_identifier] = [
                        id
                        for id in full_identifiers
                        if identifier_regex.match(id).group(1) == item["identifier"]
                    ]
                    a.add_bill(full_identifier)
                    # Emit each bill only once per scrape run.
                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session="2014-2018",
                            identifier=full_identifier,
                            title=item["title"],
                            from_organization={"name": self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(
                            note="canonical",
                            media_type="text/html",
                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier),
                        )
                        b.extras["wards"] = wards
                        addresses = addresses_d.get(full_identifier)
                        if addresses:
                            b.extras["locations"] = []
                            for address in addresses:
                                location = {"address": {"full_address": address}}
                                b.extras["locations"].append(location)
                        self.seen_agenda_items.append(full_identifier)
                        yield b
            yield e