def scrape_events_range(self, start_date, end_date):
        """Yield one ``Event`` per meeting listed between two dates.

        Walks the calendar one day at a time from ``start_date`` (inclusive)
        to ``end_date`` (exclusive), fetches that day's listing through
        ``self.extract_events_by_url`` and builds an ``Event`` for every
        entry. When an entry's publishing status shows that the agenda or
        the minutes are published, the agenda page is fetched as well and
        every agenda item is linked to its full bill identifier.

        Args:
            start_date: First day to scrape. It is combined with the meeting
                time via ``replace(hour=..., minute=...)`` and then handed to
                ``tz.localize``, so it has to be datetime-like.
            end_date: Day at which to stop; it is not scraped itself.

        Yields:
            A populated ``Event`` for every calendar entry.

        Raises:
            ValueError: If an agenda item does not correspond to exactly one
                of the meeting's full identifiers.
        """
        # Full identifiers carry a four-digit prefix (e.g. "2017.EX1.2");
        # agenda items only carry the remainder, captured by group(1).
        identifier_regex = re.compile(
            r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
        published_statuses = ('Agenda Published', 'Minutes Published')

        def daterange(first_day, last_day):
            """Yield each day from ``first_day`` up to, excluding, ``last_day``."""
            number_of_days = int((last_day - first_day).days)
            for n in range(number_of_days):
                yield first_day + dt.timedelta(n)

        for date in daterange(start_date, end_date):
            # NOTE(review): the month is sent zero-based; presumably the
            # calendar endpoint expects that -- confirm before changing.
            calendar_day_url = CALENDAR_DAY_TEMPLATE.format(
                date.year, date.month - 1, date.day)
            events = self.extract_events_by_url(calendar_day_url)
            for event in events:
                tz = pytz.timezone("America/Toronto")
                # The listing only gives a 12-hour clock time; merge it into
                # the day being scraped to get the full start timestamp.
                time = dt.datetime.strptime(event['time'], '%I:%M %p')
                start = tz.localize(
                    date.replace(hour=time.hour,
                                 minute=time.minute,
                                 second=0,
                                 microsecond=0))
                org_name = event['meeting']
                e = Event(name=org_name,
                          start_time=start,
                          timezone=tz.zone,
                          location_name=event['location'],
                          status=STATUS_DICT.get(event['meeting_status']))
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                }
                e.add_source(calendar_day_url)
                e.add_participant(
                    name=org_name,
                    type='organization',
                )

                if event['publishing_status'] in published_statuses:
                    # Meetings of the jurisdiction itself use a different
                    # agenda URL template than every other body.
                    is_council = event['meeting'] == self.jurisdiction.name
                    if is_council:
                        agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE
                    else:
                        agenda_url_template = AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = agenda_url_template.format(
                        event['meeting_id'])
                    full_identifiers = list(
                        self.full_identifiers(event['meeting_id'],
                                              is_council))

                    # Pair each well-formed full identifier with its short
                    # form once per meeting. Identifiers that do not fit the
                    # pattern are ignored instead of failing on a None match.
                    short_and_full = []
                    for full_id in full_identifiers:
                        matched = identifier_regex.match(full_id)
                        if matched:
                            short_and_full.append(
                                (matched.group(1), full_id))

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):

                        a = e.add_agenda_item(item['title'])
                        a.add_classification(item['type'].lower())
                        a['order'] = str(i)

                        candidates = [
                            full_id for short_id, full_id in short_and_full
                            if short_id == item['identifier']
                        ]
                        if len(candidates) != 1:
                            raise ValueError(
                                'Expected exactly one full identifier for '
                                'agenda item {!r} at {}, found {}'.format(
                                    item['identifier'], agenda_url,
                                    len(candidates)))
                        a.add_bill(candidates[0])

                yield e
# Example #2
# 0
    def scrape(self, window=None):
        """Yield an ``Event`` for every (API event, web event) pair.

        Args:
            window: Optional look-back period in days. When truthy it is
                converted with ``float`` and subtracted from the current UTC
                time to form the cut-off handed to ``self.events``; otherwise
                ``None`` is passed.

        Yields:
            ``Event`` objects carrying the agenda items, participant,
            sources, documents and media links found on the API record and
            its merged web record.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: the meeting name after the
                # body may itself contain hyphens, and an unbounded split
                # would then yield more than two parts and fail to unpack.
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-', 1)]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name,
                              type="organization")

            e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                         note='api')

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # this audio link, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            # The web record holds a placeholder string instead of a link
            # when no recap/minutes document has been posted.
            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                # No detail links: fall back to the general calendar page.
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
    def scrape(self, window=3):
        """Yield an ``Event`` for each (API event, web event) pair.

        Args:
            window: Look-back period in days. It is converted with ``float``
                and subtracted from the current UTC time; the result is the
                cut-off handed to ``self.events``.

        Yields:
            ``Event`` objects with media links, documents, committees,
            agenda items, present participants and sources attached.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        # Topics containing either of these texts are treated as "no topic".
        placeholder_topics = ('Multiple meeting items',
                              'AGENDA TO BE ANNOUNCED')

        for api_event, event in self.events(cutoff):
            when = api_event['start']
            location = api_event['EventLocation']

            description = event['Meeting\xa0Topic']
            if any(text in description for text in placeholder_topics):
                description = None

            # Only pass a description when there is a usable one, so the
            # Event falls back to its own default otherwise.
            event_kwargs = {
                'name': api_event["EventBodyName"],
                'start_date': when,
                'location_name': location,
                'status': api_event['status'],
            }
            if description:
                event_kwargs['description'] = description
            e = Event(**event_kwargs)

            e.pupa_id = str(api_event['EventId'])

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_kind in ('Agenda', 'Minutes'):
                self.addDocs(e, event, doc_kind)

            location_notes, other_orgs = self._parse_location(
                event[u'Meeting Location'])

            if location_notes:
                e.extras = {'location note': ' '.join(location_notes)}

            # Work out which bodies take part, starting from the event name.
            participating_orgs = []
            if e.name == 'City Council Stated Meeting':
                participating_orgs.append('New York City Council')
            elif 'committee' in e.name.lower():
                participating_orgs.append(e.name)

            if other_orgs:
                # Drop the "Jointly with the" lead-in, then split the list of
                # co-hosting bodies on its separators.
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs.extend(
                    re.split(' and the |, the ', other_orgs))

            for org in participating_orgs:
                e.add_committee(name=org)

            for item in self.agenda(api_event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                matter_file = item["EventItemMatterFile"]
                if matter_file:
                    agenda_item.add_bill(matter_file)

            # A set removes people who appear in more than one roll call.
            present = {
                call['RollCallPersonName'].strip()
                for call in self.rollcalls(api_event)
                if call['RollCallValueName'] == 'Present'
            }
            for person in present:
                e.add_participant(name=person, type="person")

            api_url = self.BASE_URL + '/events/{EventId}'.format(**api_event)
            e.add_source(api_url, note='api')

            try:
                detail_url = event['Meeting Details']['url']
            except TypeError:
                # 'Meeting Details' is not a mapping with a 'url' entry here,
                # so point at the general events page instead.
                e.add_source(self.EVENTSPAGE, note='web')
            else:
                # Only record the detail page when it actually responds 200.
                response = requests.head(detail_url)
                if response.status_code == 200:
                    e.add_source(detail_url, note='web')

            yield e
# Example #4
# 0
    def scrape(self, window=None):
        """Yield an ``Event`` for every (API event, web event) pair.

        Args:
            window: Optional look-back period in days. When truthy it is
                converted with ``float`` and subtracted from the current UTC
                time to form the cut-off handed to ``self.events``; otherwise
                ``None`` is passed.

        Yields:
            ``Event`` objects with agenda items, participant, sources,
            documents and (when available) an audio link. Events whose audio
            redirect has no ``Location`` header yet are not yielded.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None
        for event, web_event in self.events(n_days_ago):

            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: the meeting name after the
                # body may itself contain hyphens, and an unbounded split
                # would then yield more than two parts and fail to unpack.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-', 1)
                ]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=event["EventLocation"],
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links
            e.extras = {'guid': event['EventGuid']}

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name, type="organization")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
            if web_event['Audio'] != 'Not\xa0available':

                try:
                    redirect_url = self.head(
                        web_event['Audio']['url']).headers['Location']

                except KeyError:

                    # In some cases, the redirect URL does not yet contain the
                    # location of the audio file. Skip these events, and retry
                    # on next scrape. This `continue` belongs to the event
                    # loop, so the whole event is dropped for this run.

                    continue

                e.add_media_link(note=web_event['Audio']['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            # NOTE(review): when the web record only holds the placeholder
            # text (no details link), no 'web' source is added at all --
            # confirm that this is intended.
            if web_event['Meeting Details'] != 'Meeting\xa0details':
                if requests.head(web_event['Meeting Details']
                                 ['url']).status_code == 200:
                    e.add_source(web_event['Meeting Details']['url'],
                                 note='web')
                else:
                    # Detail page did not answer 200: use the calendar page.
                    e.add_source('https://metro.legistar.com/Calendar.aspx',
                                 note='web')

            yield e
# Example #5
# 0
    def scrape(self, window=None):
        """Yield an ``Event`` for every merged event outside service councils.

        Args:
            window: Optional look-back period in days. When truthy it is
                converted with ``float`` and subtracted from the current UTC
                time to form the ``since_datetime`` handed to
                ``self.events``; otherwise ``None`` is passed.

        Yields:
            ``Event`` objects with extras, agenda items, participant,
            sources, documents and media links attached.

        Raises:
            ValueError: If two agenda items of one event share the same
                ``EventItemAgendaSequence``.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        events = self.events(since_datetime=n_days_ago)

        # Ids of the bodies whose events are skipped below (body types 70
        # and 75, per the filter expression).
        service_councils = {
            sc['BodyId'] for sc in self.search(
                '/bodies/', 'BodyId', 'BodyTypeId eq 70 or BodyTypeId eq 75')
        }

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: the meeting name after the
                # body may itself contain hyphens, and an unbounded split
                # would then yield more than two parts and fail to unpack.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-', 1)
                ]
            elif event['EventBodyId'] in service_councils:
                # Don't scrape service council or service council public hearing events.
                self.info('Skipping event {0} for {1}'.format(
                    event['EventId'], event['EventBodyName']))
                continue
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(
                event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if web_event.has_ecomment:
                self.info('Adding eComment link {0} from {1}'.format(
                    web_event['eComment'],
                    web_event['Meeting Details']['url']))
                e.extras['ecomment'] = web_event['eComment']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        agenda_number = item["EventItemAgendaNumber"]
                        note = "Agenda number, {}".format(agenda_number)
                        agenda_item['notes'].append(note)

                        agenda_item['extras']['agenda_number'] = agenda_number

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item[
                        'EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [
                    item['extras']['item_agenda_sequence'] for item in e.agenda
                ]
                if len(item_agenda_sequences) != len(
                        set(item_agenda_sequences)):
                    error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
                        {event_name} on {event_date} ({legistar_api_url}). \
                        Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                    raise ValueError(
                        error_msg.format(
                            event_name=e.name,
                            event_date=e.start_date.strftime("%B %d, %Y"),
                            legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name, type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL +
                             '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(
                    note='Agenda',
                    url=event['EventAgendaFile'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        event['EventAgendaLastPublishedUTC']).date())

            # Minutes: prefer the file on the API record, then the link on
            # the web record, then whatever find_approved_minutes returns.
            if event['EventMinutesFile']:
                e.add_document(
                    note='Minutes',
                    url=event['EventMinutesFile'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        event['EventMinutesLastPublishedUTC']).date())
            elif web_event['Published minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Published minutes']['label'],
                               url=web_event['Published minutes']['url'],
                               media_type="application/pdf")
            else:
                approved_minutes = self.find_approved_minutes(event)
                if approved_minutes:
                    e.add_document(
                        note=approved_minutes['MatterAttachmentName'],
                        url=approved_minutes['MatterAttachmentHyperlink'],
                        media_type="application/pdf",
                        date=self.to_utc_timestamp(
                            approved_minutes['MatterAttachmentLastModifiedUtc']
                        ).date())

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # this audio link, and retry on next scrape.
                    continue

                # Sometimes if there is an issue getting the Spanish
                # audio created, Metro has the Spanish Audio link
                # go to the English Audio.
                #
                # Pupa does not allow for duplicate media links,
                # so we'll ignore the second media link if it's
                # the same as the first media link.
                #
                # Because of the way that the event['audio'] is created
                # the first audio link is always English and the
                # second is always Spanish
                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html',
                                 on_duplicate='ignore')

            # The agenda section above treats 'event_details' as optional,
            # so read it with .get() here as well instead of risking a
            # KeyError when the key is absent.
            detail_links = event.get('event_details')
            if detail_links:
                for link in detail_links:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

            yield e
    def scrape_events_range(self, start_date, end_date):
        """Yield one ``Event`` per meeting listed between two dates.

        Walks the calendar one day at a time from ``start_date`` (inclusive)
        to ``end_date`` (exclusive), fetches that day's listing through
        ``self.extract_events_by_url`` and builds an ``Event`` for every
        entry. When an entry's publishing status shows that the agenda or
        the minutes are published, the agenda page is fetched as well and
        every agenda item is linked to its full bill identifier.

        Args:
            start_date: First day to scrape. It is combined with the meeting
                time via ``replace(hour=..., minute=...)`` and then handed to
                ``tz.localize``, so it has to be datetime-like.
            end_date: Day at which to stop; it is not scraped itself.

        Yields:
            A populated ``Event`` for every calendar entry.

        Raises:
            ValueError: If an agenda item does not correspond to exactly one
                of the meeting's full identifiers.
        """
        # Full identifiers carry a four-digit prefix (e.g. "2017.EX1.2");
        # agenda items only carry the remainder, captured by group(1).
        identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
        published_statuses = ('Agenda Published', 'Minutes Published')

        def daterange(first_day, last_day):
            """Yield each day from ``first_day`` up to, excluding, ``last_day``."""
            number_of_days = int((last_day - first_day).days)
            for n in range(number_of_days):
                yield first_day + datetime.timedelta(n)

        for date in daterange(start_date, end_date):
            # NOTE(review): the month is sent zero-based; presumably the
            # calendar endpoint expects that -- confirm before changing.
            calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
            events = self.extract_events_by_url(calendar_day_url)
            for event in events:
                tz = pytz.timezone("America/Toronto")
                # The listing only gives a 12-hour clock time; merge it into
                # the day being scraped to get the full start timestamp.
                time = datetime.datetime.strptime(event['time'], '%I:%M %p')
                start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
                org_name = event['meeting']
                e = Event(
                    name=org_name,
                    start_time=start,
                    timezone=tz.zone,
                    location_name=event['location'],
                    status=STATUS_DICT.get(event['meeting_status'])
                )
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                }
                e.add_source(calendar_day_url)
                e.add_participant(
                    name=org_name,
                    type='organization',
                )

                if event['publishing_status'] in published_statuses:
                    # Meetings of the jurisdiction itself use a different
                    # agenda URL template than every other body.
                    is_council = event['meeting'] == self.jurisdiction.name
                    if is_council:
                        agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE
                    else:
                        agenda_url_template = AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = agenda_url_template.format(event['meeting_id'])
                    full_identifiers = list(self.full_identifiers(event['meeting_id'], is_council))

                    # Pair each well-formed full identifier with its short
                    # form once per meeting. Identifiers that do not fit the
                    # pattern are ignored instead of failing on a None match.
                    short_and_full = []
                    for full_id in full_identifiers:
                        matched = identifier_regex.match(full_id)
                        if matched:
                            short_and_full.append((matched.group(1), full_id))

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):

                        a = e.add_agenda_item(item['title'])
                        a.add_classification(item['type'].lower())
                        a['order'] = str(i)

                        candidates = [
                            full_id for short_id, full_id in short_and_full
                            if short_id == item['identifier']
                        ]
                        if len(candidates) != 1:
                            raise ValueError(
                                'Expected exactly one full identifier for '
                                'agenda item {!r} at {}, found {}'.format(
                                    item['identifier'], agenda_url,
                                    len(candidates)))
                        a.add_bill(candidates[0])

                yield e
    def scrape(self):
        """Yield an ``Event`` for each (event, agenda) pair since 2017.

        Entries without a usable location are skipped, as are entries whose
        (name, start time) pair matches one of the ten most recently
        accepted entries.

        Yields:
            ``Event`` objects with media link, documents, committees,
            sources and agenda items attached.
        """
        # Bounded memory of accepted (name, start) pairs for de-duplication.
        recently_seen = deque(maxlen=10)
        placeholder_topics = ('Multiple meeting items',
                              'AGENDA TO BE ANNOUNCED')

        for event, agenda in self.events(since=2017):
            other_orgs = ''
            extras = []

            raw_location = event[u'Meeting Location']
            if '--em--' in raw_location:
                # Text after the '--em--' marker is a note made of
                # ' - ' separated pieces.
                location_string, note = raw_location.split('--em--')[:2]
                for piece in note.split(' - '):
                    if piece.startswith('Join'):
                        other_orgs = piece
                    else:
                        extras.append(piece)
            else:
                location_string = raw_location

            # Keep only the first two hyphen-separated parts of the location.
            location_parts = location_string.split('-', 2)[:2]
            location = ', '.join(part.strip() for part in location_parts)
            if not location:
                continue

            when = self.toTime(event[u'Meeting Date'])

            # The start time of day is read from the event's iCalendar feed.
            # NOTE(review): verify=False is passed to self.get here, which
            # presumably disables TLS certificate checks -- confirm it is
            # still required.
            response = self.get(event['iCalendar']['url'], verify=False)
            calendar = self.ical(response.text)
            event_time = calendar.subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour, minute=event_time.minute)

            time_string = event['Meeting Time']
            if time_string == 'Deferred':
                status = 'cancelled'
            elif self.now() < when:
                status = 'confirmed'
            else:
                status = 'passed'

            description = event['Meeting\xa0Topic']
            if any(text in description for text in placeholder_topics):
                description = ''

            event_name = event['Name']

            event_id = (event_name, when)
            if event_id in recently_seen:
                continue
            recently_seen.append(event_id)

            e = Event(name=event_name,
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)

            if extras:
                e.extras = {'location note': ' '.join(extras)}

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_kind in ('Agenda', 'Minutes'):
                self.addDocs(e, event, doc_kind)

            # Work out which bodies take part, starting from the event name.
            participating_orgs = []
            if event['Name'] == 'City Council Stated Meeting':
                participating_orgs.append('New York City Council')
            elif 'committee' in event['Name'].lower():
                participating_orgs.append(event["Name"])

            if other_orgs:
                # Drop the "Jointly with the" lead-in, then split the list of
                # co-hosting bodies on its separators.
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs.extend(
                    re.split(' and the |, the ', other_orgs))

            for org in participating_orgs:
                e.add_committee(name=org)

            if agenda:
                e.add_source(event["Meeting Details"]['url'], note='web')

                for item, _, _ in agenda:
                    if not item["Name"]:
                        continue
                    agenda_item = e.add_agenda_item(item["Name"])
                    if not item["File\xa0#"]:
                        continue
                    # Use the recorded action as the note when there is one.
                    note = item['Action'] or 'consideration'
                    agenda_item.add_bill(item["File\xa0#"]['label'],
                                         note=note)
            else:
                e.add_source(self.EVENTSPAGE, note='web')

            yield e
# Example #8
# 0
    def scrape(self, window=None):
        """Yield one ``Event`` per merged Legistar API / web event.

        Args:
            window: Optional number of days (anything ``float()`` accepts).
                When truthy, ``self.events`` is given a UTC timestamp that
                many days in the past; otherwise it is given ``None``.

        Yields:
            Event: carrying status, location, API/web sources, agenda items,
            agenda/minutes documents, audio links and the hosting body as an
            organization participant.

        Raises:
            ValueError: if two agenda items of a single event share the same
                ``EventItemAgendaSequence``.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: a hyphen inside the meeting
                # name must not break the two-way unpacking below.
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-', 1)]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
                if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                    error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
                        {event_name} on {event_date} ({legistar_api_url}). \
                        Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                    raise ValueError(error_msg.format(event_name=e.name,
                                                      event_date=e.start_date.strftime("%B %d, %Y"),
                                                      legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name,
                              type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            # The key is treated as optional by the membership test earlier in
            # this loop, so read it with .get() instead of indexing.
            detail_links = event.get('event_details')
            if detail_links:
                for link in detail_links:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
    def scrape_events_range(self, start_date, end_date):
        """Yield an ``Event`` for every meeting on each day of a date range.

        Days run from ``start_date`` up to, but not including, ``end_date``.
        When a meeting's agenda or minutes are published, each agenda item is
        attached to the event and linked to its full identifier; a ``Bill``
        is yielded (before the event) for every full identifier not already
        recorded in ``self.seen_agenda_items``.

        Args:
            start_date: First day to scrape.
            end_date: Day at which scraping stops (exclusive).

        Yields:
            Bill or Event objects, bills of a meeting preceding its event.
        """
        toronto = pytz.timezone("America/Toronto")
        published_states = ('Agenda Published', 'Minutes Published')
        full_id_pattern = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')

        def wards_of(raw):
            # A missing value and the literal 'All' both collapse to 'all';
            # anything else is a comma-separated list.
            if not raw or raw == 'All':
                return 'all'
            return raw.split(', ')

        for offset in range(int((end_date - start_date).days)):
            date = start_date + dt.timedelta(offset)
            for event in self.extract_events_by_day(date):
                clock = dt.datetime.strptime(event['time'], '%I:%M %p')
                start = toronto.localize(
                    date.replace(hour=clock.hour,
                                 minute=clock.minute,
                                 second=0,
                                 microsecond=0))
                day_url = CALENDAR_DAY_TEMPLATE.format(
                    start.year, start.month, start.day)
                org_name = event['meeting']

                e = Event(name=org_name,
                          start_time=start,
                          timezone=toronto.zone,
                          location_name=event['location'],
                          status=STATUS_DICT.get(event['meeting_status']))
                e.add_source(day_url)
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                }
                e.add_participant(name=org_name, type='organization')

                # Nothing more to collect until an agenda has been published.
                if event['publishing_status'] not in published_states:
                    yield e
                    continue

                council = event['meeting'] == self.jurisdiction.name
                if council:
                    template = AGENDA_FULL_COUNCIL_TEMPLATE
                else:
                    template = AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(
                    self.full_identifiers(event['meeting_id'], council))

                e.add_source(agenda_url)
                for position, item in enumerate(self.agenda_from_url(agenda_url)):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(position)

                    wards = wards_of(item['wards'])
                    # Exactly one full identifier must end with this item's
                    # short identifier; the unpacking enforces that.
                    [full_identifier] = [
                        candidate for candidate in full_identifiers
                        if full_id_pattern.match(candidate).group(1) == item['identifier']
                    ]
                    a.add_bill(full_identifier)

                    if full_identifier in self.seen_agenda_items:
                        continue

                    b = Bill(
                        # TODO: Fix this hardcode
                        legislative_session='2014-2018',
                        identifier=full_identifier,
                        title=item['title'],
                        from_organization={'name': self.jurisdiction.name},
                    )
                    b.add_source(agenda_url)
                    b.add_document_link(
                        note='canonical',
                        media_type='text/html',
                        url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                    b.extras = {'wards': wards}

                    self.seen_agenda_items.append(full_identifier)

                    yield b

                yield e
示例#10
0
    def lower_parse_page(self, url):
        """Parse a hearing-schedule page and yield one ``Event`` per table.

        Every ``<table class="pubhrgtbl">`` is read row by row: a ``hrgdate``
        header cell supplies the date, a ``commtitle`` header cell supplies
        the hosting committees, and ordinary ``td`` pairs become key/value
        metadata ("Time:", "Place:", "Contact:", ...).

        Args:
            url: Page to load through ``self.lxmlize``; also recorded as the
                source of every event.

        Yields:
            Event: localized with ``self._tz``; the current year is assumed.

        Raises:
            AssertionError: if a table does not contain exactly one of the
                known title keys.
            KeyError: if a table lacks "Time:", "Place:" or "Contact:".
        """
        page = self.lxmlize(url)
        tables = page.xpath("//table[@class='pubhrgtbl']")
        # Set once here and only overwritten when a date header is seen, so
        # the last date persists across tables.
        date = None
        for table in tables:
            metainf = {}
            rows = table.xpath(".//tr")
            for row in rows:
                tds = row.xpath("./*")
                if len(tds) < 2:
                    continue
                key, value = tds

                if key.tag == "th" and key.get("class") == "hrgdate":
                    date = key.text_content()
                    date = re.sub(r"\s+", " ", date)
                    date = re.sub(".*POSTPONED NEW DATE", "", date).strip()

                # Due to the html structure this shouldn't be an elif
                # It needs to fire twice in the same loop iteration
                if value.tag == "th" and value.get("class") == "commtitle":
                    coms = value.xpath('.//div[contains(@class,"comm-txt")]/text()')

                elif key.tag == "td":
                    key = key.text_content().strip()
                    value = value.text_content().strip()
                    value = value.replace(u"\x96", "-")
                    value = re.sub(r"\s+", " ", value)
                    metainf[key] = value

            time = metainf["Time:"]
            repl = {"A.M.": "AM", "P.M.": "PM"}
            drepl = {"Sept": "Sep"}
            for old, new in repl.items():
                time = time.replace(old, new)

            for old, new in drepl.items():
                date = date.replace(old, new)

            # Keep only the start of a "start - end" time range.
            time = re.sub("-.*", "", time)
            time = time.strip()

            year = dt.datetime.now().year

            date = "%s %s %s" % (date, year, time)

            if "tbd" in date.lower():
                continue

            date = date.replace(" PLEASE NOTE NEW TIME", "")

            # Check if the event has been postponed.
            postponed = "POSTPONED" in date
            if postponed:
                date = date.replace(" POSTPONED", "")

            date_formats = ["%B %d %Y %I:%M %p", "%b. %d %Y %I:%M %p"]
            start = None
            for fmt in date_formats:
                try:
                    start = dt.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # If the datetime can't be parsed, bail.
            # NOTE(review): this ends the whole generator, not just the
            # current table -- confirm that is intended.
            if start is None:
                return

            title_keys = set(metainf) & {
                "Public Hearing:",
                "Summit:",
                "Roundtable:",
                "Public Roundtable:",
                "Public Meeting:",
                "Public Forum:",
                "Meeting:",
            }
            # Raised explicitly so the check survives ``python -O``.
            if len(title_keys) != 1:
                raise AssertionError("Couldn't determine event title.")
            (title_key,) = title_keys
            title = metainf[title_key]

            title = re.sub(
                r"\*\*Click here to view public hearing notice\*\*", "", title
            )

            # If event was postponed, add a warning to the title.
            if postponed:
                title = "POSTPONED: %s" % title

            event = Event(
                name=title,
                start_date=self._tz.localize(start),
                location_name=metainf["Place:"],
            )
            event.extras = {"contact": metainf["Contact:"]}
            if "Media Contact:" in metainf:
                event.extras.update(media_contact=metainf["Media Contact:"])
            event.add_source(url)

            for com in coms:
                event.add_participant(com.strip(), type="committee", note="host")
                participant = event.participants[-1]
                # Store a plain dict, not a one-element tuple wrapping it.
                participant["extras"] = {"chamber": self.classify_committee(com)}

            yield event
示例#11
0
    def scrape_events_range(self, start_date, end_date):
        """Yield an ``Event`` for every meeting on each day of a date range.

        Days run from ``start_date`` up to, but not including, ``end_date``.
        When a meeting's agenda or minutes are published, every agenda item is
        attached to the event. Items of type ``ACTION`` are additionally
        linked to their full identifier, and a ``Bill`` is yielded (before the
        event) when the item's two-letter committee code equals the code of
        the committee holding the meeting.

        Args:
            start_date: First day to scrape.
            end_date: Day at which scraping stops (exclusive).

        Yields:
            Bill or Event objects, bills of a meeting preceding its event.
        """
        toronto = pytz.timezone("America/Toronto")
        published_states = ('Agenda Published', 'Minutes Published')
        full_id_pattern = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
        origin_code_pattern = re.compile(r'([A-Z]{2})[0-9]+\.[0-9]+')

        def wards_of(raw):
            # A missing value and the literal 'All' both collapse to 'all';
            # anything else is a comma-separated list.
            if not raw or raw == 'All':
                return 'all'
            return raw.split(', ')

        for offset in range(int((end_date - start_date).days)):
            date = start_date + dt.timedelta(offset)
            for event in self.extract_events_by_day(date):
                clock = dt.datetime.strptime(event['time'], '%I:%M %p')
                start = toronto.localize(
                    date.replace(hour=clock.hour,
                                 minute=clock.minute,
                                 second=0,
                                 microsecond=0))
                day_url = CALENDAR_DAY_TEMPLATE.format(
                    start.year, start.month, start.day)
                org_name = event['meeting']

                e = Event(name=org_name,
                          start_time=start,
                          timezone=toronto.zone,
                          location_name=event['location'],
                          status=STATUS_DICT.get(event['meeting_status']))
                e.add_source(day_url)
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                }
                e.add_participant(name=org_name, type='organization')

                # Nothing more to collect until an agenda has been published.
                if event['publishing_status'] not in published_states:
                    yield e
                    continue

                council = event['meeting'] == self.jurisdiction.name
                if council:
                    template = AGENDA_FULL_COUNCIL_TEMPLATE
                else:
                    template = AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(
                    self.full_identifiers(event['meeting_id'], council))

                e.add_source(agenda_url)
                for position, item in enumerate(self.agenda_from_url(agenda_url)):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(position)

                    # Only ACTION items are tied to a bill.
                    if item['type'] != 'ACTION':
                        continue

                    wards = wards_of(item['wards'])
                    # Exactly one full identifier must end with this item's
                    # short identifier; the unpacking enforces that.
                    [full_identifier] = [
                        candidate for candidate in full_identifiers
                        if full_id_pattern.match(candidate).group(1) == item['identifier']
                    ]
                    a.add_bill(full_identifier)

                    # `org_code` is two-letter code for committee
                    host_code = self.committees_by_name.get(org_name)[0]['code']
                    origin_code = origin_code_pattern.search(
                        item['identifier']).group(1)
                    if host_code != origin_code:
                        continue

                    b = Bill(
                        # TODO: Fix this hardcode
                        legislative_session='2014-2018',
                        identifier=full_identifier,
                        title=item['title'],
                        from_organization={'name': org_name},
                    )
                    b.add_source(agenda_url)
                    b.add_document_link(
                        note='canonical',
                        media_type='text/html',
                        url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                    b.extras = {'wards': wards}

                    yield b

                yield e
示例#12
0
    def scrape(self):
        """Yield an ``Event`` for each entry returned by ``self.events(since=2011)``.

        The "Meeting Location" text is split into a location, an optional
        "Join..." clause naming additional bodies, and free-form location
        notes. Entries with an empty location are skipped, as are entries
        whose (name, start) pair matches one of the ten most recently
        accepted entries.

        Yields:
            Event: with status, description, optional recording link, agenda
            and minutes documents, participating committees, agenda items and
            a source URL.
        """
        recent_ids = deque(maxlen=10)

        for event, agenda in self.events(since=2011):
            joint_clause = ''
            location_notes = []

            raw_location = event[u'Meeting Location']
            if '--em--' in raw_location:
                location_string, note = raw_location.split('--em--')[:2]
                for fragment in note.split(' - '):
                    if fragment.startswith('Join'):
                        joint_clause = fragment
                    else:
                        location_notes.append(fragment)
            else:
                location_string = raw_location

            # Keep at most the first two hyphen-separated parts.
            leading_parts = location_string.split('-', 2)[0:2]
            location = ', '.join(part.strip() for part in leading_parts)
            if not location:
                continue

            when = self.toTime(event[u'Meeting Date'])

            # The time of day comes from the iCalendar DTSTART.
            ical_start = event['iCalendar'].subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=ical_start.hour,
                                minute=ical_start.minute)

            if event['Meeting Time'] == 'Deferred':
                status = 'cancelled'
            elif self.now() < when:
                status = 'confirmed'
            else:
                status = 'passed'

            description = event['Meeting\xa0Topic']
            if ('Multiple meeting items' in description
                    or 'AGENDA TO BE ANNOUNCED' in description):
                description = ''

            event_name = event['Name']

            event_id = (event_name, when)
            if event_id in recent_ids:
                continue
            recent_ids.append(event_id)

            e = Event(name=event_name,
                      start_time=when,
                      timezone=self.TIMEZONE,
                      description=description,
                      location_name=location,
                      status=status)

            if location_notes:
                e.extras = {'location note': ' '.join(location_notes)}

            multimedia = event['Multimedia']
            if multimedia != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=multimedia['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_kind in ('Agenda', 'Minutes'):
                self.addDocs(e, event, doc_kind)

            if event_name == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in event_name.lower():
                participating_orgs = [event_name]
            else:
                participating_orgs = []

            if joint_clause:
                joint_clause = re.sub('Jointl*y with the ', '', joint_clause)
                participating_orgs.extend(
                    re.split(' and the |, the ', joint_clause))

            for org in participating_orgs:
                e.add_committee(name=org)

            if not agenda:
                e.add_source(self.EVENTSPAGE)
            else:
                e.add_source(event["Meeting Details"]['url'])

                for item, _, _ in agenda:
                    if not item["Name"]:
                        continue
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        # Fall back to a generic note when no action is given.
                        note = item['Action'] or 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)

            yield e
示例#13
0
    def scrape(self, window=None):
        """Yield one ``Event`` per merged Legistar API / web event.

        Args:
            window: Optional number of days (anything ``float()`` accepts).
                When truthy, ``self.events`` is given a UTC timestamp that
                many days in the past; otherwise it is given ``None``.

        Yields:
            Event: carrying status, location, API/web sources, agenda items,
            agenda/minutes documents, audio links and the hosting body as an
            organization participant.

        Raises:
            ValueError: if two agenda items of a single event share the same
                ``EventItemAgendaSequence``.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: a hyphen inside the meeting
                # name must not break the two-way unpacking below.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-', 1)
                ]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(
                event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(
                            item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item[
                        'EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [
                    item['extras']['item_agenda_sequence'] for item in e.agenda
                ]
                if len(item_agenda_sequences) != len(
                        set(item_agenda_sequences)):
                    error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
                        {event_name} on {event_date} ({legistar_api_url}). \
                        Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                    raise ValueError(
                        error_msg.format(
                            event_name=e.name,
                            event_date=e.start_date.strftime("%B %d, %Y"),
                            legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name, type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL +
                             '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            # The key is treated as optional by the membership test earlier in
            # this loop, so read it with .get() instead of indexing.
            detail_links = event.get('event_details')
            if detail_links:
                for link in detail_links:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

            yield e
示例#14
0
    def lower_parse_page(self, url):
        """Parse a hearing-schedule page and yield one ``Event`` per table.

        Every ``<table class="pubhrgtbl">`` is read row by row: a ``hrgdate``
        header cell supplies the date, a ``commtitle`` header cell supplies
        the hosting committees, and ordinary ``td`` pairs become key/value
        metadata ('Time:', 'Place:', 'Contact:', ...).

        Args:
            url: Page to load through ``self.lxmlize``; also recorded as the
                source of every event.

        Yields:
            Event: localized with ``self._tz``; the current year is assumed.

        Raises:
            AssertionError: if a table does not contain exactly one of the
                known title keys.
            KeyError: if a table lacks 'Time:', 'Place:' or 'Contact:'.
        """
        page = self.lxmlize(url)
        tables = page.xpath("//table[@class='pubhrgtbl']")
        # Set once here and only overwritten when a date header is seen, so
        # the last date persists across tables.
        date = None
        for table in tables:
            metainf = {}
            rows = table.xpath(".//tr")
            for row in rows:
                tds = row.xpath("./*")
                if len(tds) < 2:
                    continue
                key, value = tds

                if key.tag == 'th' and key.get("class") == 'hrgdate':
                    date = key.text_content()
                    date = re.sub(r"\s+", " ", date)
                    date = re.sub(".*POSTPONED NEW DATE", "", date).strip()

                # Due to the html structure this shouldn't be an elif
                # It needs to fire twice in the same loop iteration
                if value.tag == 'th' and value.get("class") == 'commtitle':
                    coms = value.xpath('.//div[contains(@class,"comm-txt")]/text()')

                elif key.tag == 'td':
                    key = key.text_content().strip()
                    value = value.text_content().strip()
                    value = value.replace(u'\x96', '-')
                    value = re.sub(r"\s+", " ", value)
                    metainf[key] = value

            time = metainf['Time:']
            repl = {
                "A.M.": "AM",
                "P.M.": "PM",
            }
            drepl = {
                "Sept": "Sep"
            }
            for old, new in repl.items():
                time = time.replace(old, new)

            for old, new in drepl.items():
                date = date.replace(old, new)

            # Keep only the start of a "start - end" time range.
            time = re.sub("-.*", "", time)
            time = time.strip()

            year = dt.datetime.now().year

            date = "%s %s %s" % (
                date,
                year,
                time
            )

            if "tbd" in date.lower():
                continue

            date = date.replace(' PLEASE NOTE NEW TIME', '')

            # Check if the event has been postponed.
            postponed = 'POSTPONED' in date
            if postponed:
                date = date.replace(' POSTPONED', '')

            date_formats = ["%B %d %Y %I:%M %p", "%b. %d %Y %I:%M %p"]
            start = None
            for fmt in date_formats:
                try:
                    start = dt.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # If the datetime can't be parsed, bail.
            # NOTE(review): this ends the whole generator, not just the
            # current table -- confirm that is intended.
            if start is None:
                return

            title_keys = set(metainf) & {
                'Public Hearing:', 'Summit:', 'Roundtable:',
                'Public Roundtable:', 'Public Meeting:', 'Public Forum:',
                'Meeting:'}
            # Raised explicitly so the check survives ``python -O``.
            if len(title_keys) != 1:
                raise AssertionError("Couldn't determine event title.")
            (title_key,) = title_keys
            title = metainf[title_key]

            title = re.sub(
                r"\*\*Click here to view public hearing notice\*\*",
                "",
                title
            )

            # If event was postponed, add a warning to the title.
            if postponed:
                title = 'POSTPONED: %s' % title

            event = Event(
                name=title,
                start_date=self._tz.localize(start),
                location_name=metainf['Place:'],
            )
            event.extras = {'contact': metainf['Contact:']}
            if 'Media Contact:' in metainf:
                event.extras.update(media_contact=metainf['Media Contact:'])
            event.add_source(url)

            for com in coms:
                event.add_participant(
                    com.strip(),
                    type='committee',
                    note='host',
                )
                participant = event.participants[-1]
                # Store a plain dict; a trailing comma on this line would turn
                # the value into a one-element tuple.
                participant['extras'] = {'chamber': self.classify_committee(com)}

            yield event
    def scrape_events_range(self, start_date, end_date):
        """Yield the meetings listed on each calendar day of a date range.

        For every day from ``start_date`` up to, but not including,
        ``end_date`` the day's calendar page is fetched and one ``Event`` is
        built per listed meeting. When a meeting's agenda or minutes are
        published, its agenda items are attached to the event, and a ``Bill``
        is yielded for every item whose full identifier is not yet recorded in
        ``self.seen_agenda_items``. Bills are yielded before the event that
        references them.

        Args:
            start_date: First day to scrape. ``hour``/``minute`` are replaced
                on it and the result is localized, so it is presumably a naive
                ``datetime`` -- TODO confirm against callers.
            end_date: Exclusive upper bound of the range; same type as
                ``start_date``.

        Yields:
            ``Bill`` and ``Event`` objects.

        Raises:
            ValueError: If an agenda item does not correspond to exactly one
                of the meeting's full identifiers.
        """
        # A full identifier is a four-digit prefix followed by the identifier
        # that agenda items carry; the capture group extracts the latter.
        identifier_regex = re.compile(r"^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$")
        published_statuses = ("Agenda Published", "Minutes Published")

        def daterange(first, last):
            # Half-open range: ``last`` itself is never produced.
            for n in range((last - first).days):
                yield first + dt.timedelta(n)

        def normalize_wards(raw):
            # A missing ward list is treated the same as an explicit "All".
            if not raw or raw == "All":
                return "all"
            return raw.split(", ")

        for date in daterange(start_date, end_date):
            # NOTE(review): the month is shifted down by one, presumably
            # because the calendar endpoint uses zero-based months -- confirm.
            calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
            for event in self.extract_events_by_url(calendar_day_url):
                tz = pytz.timezone("America/Toronto")
                time = dt.datetime.strptime(event["time"], "%I:%M %p")
                start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
                org_name = event["meeting"]
                e = Event(
                    name=org_name,
                    start_time=start,
                    timezone=tz.zone,
                    location_name=event["location"],
                    status=STATUS_DICT.get(event["meeting_status"]),
                )
                e.extras = {"meeting_number": event["no"], "tmmis_meeting_id": event["meeting_id"]}
                e.add_source(calendar_day_url)
                e.add_participant(name=org_name, type="organization")

                if event["publishing_status"] in published_statuses:
                    # Council meetings use a different agenda URL template
                    # than every other meeting.
                    council = event["meeting"] == self.jurisdiction.name
                    agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE if council else AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = agenda_url_template.format(event["meeting_id"])
                    full_identifiers = list(self.full_identifiers(event["meeting_id"], council))

                    # Parse every full identifier once per meeting. Ones that
                    # do not fit the pattern can never match an agenda item,
                    # so they are left out instead of crashing the lookup.
                    identifier_pairs = []
                    for candidate in full_identifiers:
                        match = identifier_regex.match(candidate)
                        if match:
                            identifier_pairs.append((match.group(1), candidate))

                    event_map_url_template = (
                        "http://app.toronto.ca/tmmis/getAddressList.do?function=getMeetingAddressList&meetingId={}"
                    )
                    event_map_url = event_map_url_template.format(event["meeting_id"])
                    addresses_d = self.addressesByAgendaId(event_map_url)

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):
                        a = e.add_agenda_item(item["title"])
                        a.add_classification(item["type"].lower())
                        a["order"] = str(i)

                        wards = normalize_wards(item["wards"])

                        matches = [full for short, full in identifier_pairs if short == item["identifier"]]
                        if len(matches) != 1:
                            raise ValueError(
                                "Expected exactly one full identifier for agenda item {!r} "
                                "of meeting {}, found {}.".format(
                                    item["identifier"], event["meeting_id"], len(matches)
                                )
                            )
                        full_identifier = matches[0]

                        a.add_bill(full_identifier)
                        if full_identifier not in self.seen_agenda_items:
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session="2014-2018",
                                identifier=full_identifier,
                                title=item["title"],
                                from_organization={"name": self.jurisdiction.name},
                            )
                            b.add_source(agenda_url)
                            b.add_document_link(
                                note="canonical",
                                media_type="text/html",
                                url=AGENDA_ITEM_TEMPLATE.format(full_identifier),
                            )
                            b.extras["wards"] = wards

                            addresses = addresses_d.get(full_identifier)
                            if addresses:
                                b.extras["locations"] = [
                                    {"address": {"full_address": address}} for address in addresses
                                ]

                            # Remember the item so that later meetings which
                            # reference it do not emit a second Bill.
                            self.seen_agenda_items.append(full_identifier)

                            yield b

                yield e