def get_events(self):
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"
    page = self.lxmlize(url)

    for entry in page.xpath(
            "//tr[@style='font-family: Verdana; font-size: 12px;']"):
        name, when, links = entry.xpath(".//td")
        # Strip the mis-encoded non-breaking spaces the page embeds.
        name = name.text.strip().replace(u"\xc2\xa0", "")
        when = when.text.strip().replace(u"\xc2\xa0", "")
        when = dt.datetime.strptime(when, "%m/%d/%Y")

        links = links.xpath(".//a")
        links = {x.text: x.attrib['href'] for x in links}

        e = Event(name=name, session=self.session, when=when,
                  location='unknown')
        e.add_source(url)

        # A distinct loop variable keeps the page URL used by
        # add_source() above intact across entries.
        for note, href in links.items():
            e.add_link(note=note, url=href)

        yield e
def event_obj():
    e = Event(name="get-together",
              when=dt.datetime.utcnow(),
              location="Joe's Place")
    e.add_source(url='foobar')
    e.validate()
    return e
def scrape_event_page(self, event):
    url = event.attrib['href']
    page = self.lxmlize(url)

    title = page.xpath("//h2[@class='evlist_header']")
    title = title[0].text.strip() if title else None
    if title is None:
        return
    if "CANCELED" in title:
        return

    # Collect the <label>/<div> pairs into a {label: text} mapping.
    info = page.xpath(
        "//div[@style='position:relative;margin-right:40px;']")[0]
    blocks = info.xpath(".//div")
    ret = {}
    for block in blocks:
        els = block.xpath("./*")
        if not els:
            continue
        if els[0].tag != 'label':
            continue
        label, div = els
        ret[label.text_content().strip()] = div.text_content().strip()

    date, start, end = (x.strip() for x in ret['When:'].split("\n"))
    start = re.sub("^@", "", start).strip()
    end = end.replace("-", "").strip()

    # Expand abbreviated month names so strptime's %B matches.
    replace = [('Apr', 'April')]
    skip = ["Occurs every"]  # recurring events carry no concrete end time

    for k, v in replace:
        date = date.replace(k, v).strip()

    if any(x in end for x in skip):
        return

    start = "%s %s" % (date, start)
    end = "%s %s" % (date, end)
    start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p")
                  for x in (start, end))

    event = Event(session=self.session, name=title,
                  location=ret['Where:'], when=start, end=end)
    event.add_source(url)
    yield event
def get_events(self):
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    curdate = None
    page = self.lxmlize(CAL_PAGE)
    for el in page.xpath("//div[@id='Section1']/*"):
        # Headings carry the date that applies to the <p> entries below.
        if el.tag[0] == 'h':
            when = WHEN.findall(el.text_content())
            when = when[0] if when else None
            if when is None:
                continue
            curdate = " ".join(when)

        if el.tag == 'p':  # and el.attrib.get('class') == 'MsoNormal'
            agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
            agenda = agenda[0] if agenda else None
            if agenda is None:
                continue

            info = self.cleanup(el.text_content())
            when = DT.findall(info)
            when = when[0] if when else None
            if when is None:
                continue

            time, ampm = when
            if curdate is None:
                self.warning("Can't scrape, since I don't know what date it is")
                continue

            tbuf = " ".join([curdate, time, ampm])
            obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")

            try:
                _, where = info.rsplit(u"–", 1)
            except ValueError:
                continue

            where = where.replace(u"\xa0", " ")  # non-breaking spaces
            where = re.sub(r"\s+", " ", where).strip()
            where = re.sub("agenda$", "", where).strip()

            event = Event(name=info, session=self.session, when=obj,
                          location=where)
            event.add_source(CAL_PAGE)
            yield event
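# CAL_PAGE, WHEN, and DT above are module-level constants whose definitions
# aren't shown in this snippet. A minimal sketch of plausible definitions,
# as assumptions only -- the real constants may differ:
import re

CAL_PAGE = "http://example.gov/council/calendar"  # hypothetical URL

# WHEN.findall() results are joined with " " and later parsed as
# "%B %d %Y", so it plausibly captures (month, day, year) groups:
WHEN = re.compile(
    r"(January|February|March|April|May|June|July|August|"
    r"September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})")

# DT.findall() yields (time, ampm) pairs parsed as "%I:%M %p":
DT = re.compile(r"(\d{1,2}:\d{2})\s*([AP]M)")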
def test_basic_agenda():
    e = Event(name="get-together",
              when=dt.datetime.utcnow(),
              location="Joe's Place")
    e.add_source(url='foobar')
    e.validate()

    agenda = e.add_agenda_item("foo bar")
    assert agenda
    e.validate()
def get_events(self):
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    # Look from ten days back to thirty days out.
    start = dt.datetime.utcnow() - dt.timedelta(days=10)
    end = start + dt.timedelta(days=30)

    url = URL.format(**{"from": start.strftime("%Y/%m/%d"),
                        "til": end.strftime("%Y/%m/%d")})
    page = self.lxmlize(url)
    events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

    for event in events:
        po = CLICK_INFO.match(event.xpath(".//span")[0].attrib['onclick'])
        if po is None:
            continue

        # The popover ID is used to fetch more details on the event.
        poid = po.groupdict()['info_id']
        popage = self.popOverUrl(poid)

        when = dt.datetime.strptime(popage.xpath("//strong")[0].text,
                                    "%B %d, %Y @ %I:%M %p")
        who = popage.xpath("//h1")[0].text

        related = []
        for item in popage.xpath("//div"):
            t = item.text
            if t is None:
                continue
            t = t.strip()
            for related_entity in ORD_INFO.findall(t):
                related.append({"ord_no": related_entity, "what": t})

        e = Event(name=who, session=self.session, when=when,
                  location='unknown')
        e.add_source(url)

        for o in related:
            i = e.add_agenda_item(o['what'])
            i.add_bill(o['ord_no'], note='consideration')

        yield e
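# URL, CLICK_INFO, and ORD_INFO above aren't defined in this snippet. A
# hedged sketch of plausible definitions (assumptions, not the site's real
# formats):
import re

URL = "http://example.gov/committees/events?from={from}&til={til}"  # hypothetical

# Matched against the span's onclick handler; the scraper only relies on a
# named 'info_id' group:
CLICK_INFO = re.compile(r"showDetails\('(?P<info_id>\d+)'\)")  # hypothetical

# Pulls ordinance numbers out of free text for add_bill():
ORD_INFO = re.compile(r"\b(O-\d{4}-\d+)\b")  # hypothetical numbering scheme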
def test_basic_event():
    """ test that we can create an event """
    e = Event(name="get-together",
              when=dt.datetime.utcnow(),
              location="Joe's Place")
    e.add_source(url='foobar')
    e.validate()

    e.add_link("http://foobar.baz")
    e.add_link("http://foobar.baz", note="foo")
    e.validate()
    assert len(e.links) == 2
def get_events(self):
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    url = "http://chicago.legistar.com/Calendar.aspx/"
    page = self.lxmlize(url)

    main = page.xpath("//table[@class='rgMasterTable']")[0]
    rows = main.xpath(".//tr")[1:]
    for row in rows:
        if "No records were found." in row.text_content():
            self.warning("Hum. They don't seem to have events?")
            continue

        # `_` is the calendar-icon cell next to the date on the page.
        (name, date, _, time, where, details,
         notice, agenda, summary, video) = row.xpath(".//td")

        name = name.text_content().strip()  # cell keeps an href; take text only
        time = time.text_content().strip()
        location = where.text_content().strip()

        if "Deferred" in time:
            continue

        all_day = False
        if time == "":
            all_day = True
            when = dt.datetime.strptime(date.text.strip(), "%m/%d/%Y")
        else:
            when = dt.datetime.strptime(
                "%s %s" % (date.text.strip(), time),
                "%m/%d/%Y %I:%M %p")

        event = Event(name=name, session=self.session, when=when,
                      location=location)
        event.add_source(url)

        for a in agenda.xpath(".//a[@href]"):
            event.add_link(a.text, a.attrib['href'])

        for minute in summary.xpath(".//a[@href]"):
            event.add_link(minute.text, minute.attrib['href'])

        yield event
def get_events(self):
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    url = "http://legistar.council.nyc.gov/Calendar.aspx"
    page = self.lxmlize(url)

    main = page.xpath("//table[@class='rgMasterTable']")[0]
    rows = main.xpath(".//tr")[1:]
    for row in rows:
        els = row.xpath(".//td")
        if len(els) <= 2:
            continue  # Odd one-off row.

        # `_` is the calendar-icon cell next to the meeting date.
        (name, date, _, time, where, topic,
         details, agenda, minutes, media) = els

        name = name.text_content().strip()  # cell keeps an href; take text only
        time = time.text_content().strip()
        location = where.text_content().strip()
        topic = topic.text_content().strip()

        if "Deferred" in time:
            continue

        all_day = False
        if time == "":
            all_day = True
            when = dt.datetime.strptime(date.text.strip(), "%m/%d/%Y")
        else:
            when = dt.datetime.strptime(
                "%s %s" % (date.text.strip(), time),
                "%m/%d/%Y %I:%M %p")

        event = Event(name=name, session=self.session, when=when,
                      location=location)
        event.add_source(url)

        for detail in details.xpath(".//a[@href]"):
            event.add_document(detail.text, detail.attrib["href"],
                               mimetype="text/html")

        for a in agenda.xpath(".//a[@href]"):
            event.add_document(a.text, a.attrib["href"],
                               mimetype="application/pdf")

        for minute in minutes.xpath(".//a[@href]"):
            event.add_document(minute.text, minute.attrib["href"],
                               mimetype="application/pdf")

        yield event
def scrape_event(self, href):
    page = self.lxmlize(href.attrib['href'])

    what = page.xpath("//td[@id='ctl14_ctl16_tdTitleCell']")[0].text
    info = page.xpath("//div[@id='ctl14_pnlEvent']//table//table//tr")[1:]

    ret = {"Location:": "Unknown"}
    for tr in info:
        tds = tr.xpath(".//td")
        if len(tds) < 2:
            continue
        # Row label kept separate so the event title in `what` above
        # isn't overwritten.
        key, data = [tds.pop(0).text_content().strip() for x in range(2)]
        ret[key] = data

    agendas = page.xpath("//a[contains(@title, 'Meeting Agenda')]")
    for agenda in agendas:
        print("Agenda:", agenda.attrib['href'])

    t = ret['Time:']
    start_time, end_time = t, None
    if "-" in t:
        start_time, end_time = (x.strip() for x in t.split("-", 1))

    dts = "%B %d, %Y %I:%M %p"
    start = dt.datetime.strptime("%s %s" % (ret['Date:'], start_time), dts)

    end = None
    if end_time:
        end = dt.datetime.strptime("%s %s" % (ret['Date:'], end_time), dts)

    kwargs = {}
    if end:
        kwargs['end'] = end

    e = Event(name=what, session=self.session, location=ret['Location:'],
              when=start, **kwargs)
    e.add_source(href.attrib['href'])
    yield e
def get_events(self):
    # get list of executive orders
    url = 'http://www.governor.ny.gov/sl2/ExecutiveOrderindex'
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # extract governor's name (str.lstrip strips a character set, not a
    # prefix, so use a regex instead)
    gov = page.xpath(
        "(//div[@class='section-header']/div/div/div/a/div/h2)[1]")[0]
    governor_name = re.sub(r'^Governor\s+', '', gov.text)

    # scrape each executive order
    for eo_par in page.xpath("//div[@class='content']/p"):
        for link in eo_par.xpath(".//a"):
            url = link.get('href').lower()
            if url.endswith('.pdf'):
                continue

            # get date for executive order
            eo_page = self.urlopen(url)
            eo_page = lxml.html.fromstring(eo_page)
            eo_page = re.sub(r'(\r*\n|\W)', ' ',
                             eo_page.xpath('string()').lower())
            eo_page = re.sub(r'\s+', ' ', eo_page)
            date_par = re.search(r'(?:g i v e n)(.*)(?:by the governor)',
                                 eo_page).groups()[0]
            date_comp = [s.strip() for s in re.match(
                r'(?:.*this)(.*)(?:day of)(.*)(?:in the year)(.*)',
                date_par).groups()]
            eo_date = dt.datetime.strptime(
                ' '.join((str(Wtn.parse(date_comp[0])),
                          date_comp[1],
                          str(Wtn.parse(date_comp[2])))),
                '%d %B %Y')

            # build yield object
            eo_number = eo_par.xpath('string()').split(':', 1)[0]
            eo = Event(eo_number, eo_date, 'New York')
            eo.add_person(governor_name, 'governor')
            eo.description = link.text
            eo.add_document(eo_number, url, 'text/html')
            eo.add_source(url)
            yield eo

    # TODO: get list of press statements
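# Wtn above appears to be a words-to-number helper ("given ... this
# twentieth day of March in the year two thousand twelve"). Its
# implementation isn't shown; the following is a minimal sketch of the
# interface the scraper relies on -- an assumption, not the real library:
class Wtn(object):
    """Hypothetical stand-in for the words-to-number helper used above."""
    _SMALL = {
        'one': 1, 'first': 1, 'two': 2, 'second': 2, 'three': 3, 'third': 3,
        'four': 4, 'fourth': 4, 'five': 5, 'fifth': 5, 'six': 6, 'sixth': 6,
        'seven': 7, 'seventh': 7, 'eight': 8, 'eighth': 8,
        'nine': 9, 'ninth': 9, 'ten': 10, 'tenth': 10,
        'eleven': 11, 'eleventh': 11, 'twelve': 12, 'twelfth': 12,
        'thirteen': 13, 'thirteenth': 13, 'fourteen': 14, 'fourteenth': 14,
        'fifteen': 15, 'fifteenth': 15, 'sixteen': 16, 'sixteenth': 16,
        'seventeen': 17, 'seventeenth': 17, 'eighteen': 18, 'eighteenth': 18,
        'nineteen': 19, 'nineteenth': 19,
        'twenty': 20, 'twentieth': 20, 'thirty': 30, 'thirtieth': 30,
        'forty': 40, 'fortieth': 40, 'fifty': 50, 'fiftieth': 50,
        'sixty': 60, 'sixtieth': 60, 'seventy': 70, 'seventieth': 70,
        'eighty': 80, 'eightieth': 80, 'ninety': 90, 'ninetieth': 90,
    }
    _SCALE = {'hundred': 100, 'thousand': 1000}

    @classmethod
    def parse(cls, text):
        # "twenty first" -> 21; "two thousand twelve" -> 2012
        total, current = 0, 0
        for word in text.lower().replace('-', ' ').split():
            if word in cls._SMALL:
                current += cls._SMALL[word]
            elif word in cls._SCALE:
                current = max(current, 1) * cls._SCALE[word]
                if cls._SCALE[word] >= 1000:
                    total += current
                    current = 0
        return total + current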
def get_events(self):
    for page in self.eventPages(EVENTSPAGE):
        events_table = page.xpath("//table[@class='rgMasterTable']")[0]
        for events, headers, rows in self.parseDataTable(events_table):
            print(events)

            # Location strings look like "Room -- [status] Chicago, Illinois".
            location_string = events[u'Meeting\xa0Location']
            location_list = location_string.split('--')
            location = ', '.join(location_list[0:2])

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status = status_string[1].lower()
                if status not in ['cancelled', 'tentative',
                                  'confirmed', 'passed']:
                    print(status)
                    status = 'confirmed'
            else:
                status = 'confirmed'

            # Merge the time-of-day column into the meeting date, keeping
            # the minutes as well as the hour.
            when = events[u'Meeting\xa0Date']
            time_string = events[u'Meeting\xa0Time']
            event_time = datetime.datetime.strptime(time_string, "%I:%M %p")
            when = when.replace(hour=event_time.hour,
                                minute=event_time.minute)

            e = Event(name=events["Name"]["label"],
                      session=self.session,
                      when=when,
                      location=location,
                      status=status)
            e.add_source(EVENTSPAGE)

            if events['Video'] != u'Not\xa0available':
                print(events['Video'])

            yield e
def get_events(self):
    # get list of executive orders
    url = 'http://nj.gov/infobank/circular/eoindex.htm'
    page = self.urlopen(url)
    page = lxml_html.fromstring(page)
    page.make_links_absolute(url)

    # state variables for parser
    governor_name = None
    gov_session_name = None

    # parse the table of executive orders
    for eo_row in page.xpath('//table[@border>0]//tr'):
        cols = eo_row.xpath('.//td')

        # extract governor's name
        if len(cols) == 1:
            # remove things like "'s"
            governor_name = re.sub(r'\W\w\s', ' ', eo_row.xpath('string()'))
            governor_name = re.sub(r'\r*\n|\W', ' ', governor_name)
            governor_name = re.sub(r'\s+', ' ', governor_name)
            governor_name = re.search(
                r'executive order.*governor(.*)administration',
                governor_name, re.IGNORECASE).groups()[0].strip()
            gov_session_name = re.sub(r'\s+', '_', governor_name)

        # extract executive order
        elif len(cols) == 3:
            if self.session == gov_session_name:
                eo_num = cols[0].xpath('string()').strip()
                try:
                    float(eo_num)
                except ValueError:
                    continue

                eo_title = re.sub(r'\r*\n', ' ', cols[1].xpath('string()'))
                eo_title = re.sub(r'\s+', ' ', eo_title)
                eo_title = re.sub(r'\[.*pdf.*\]', '', eo_title).strip()
                if not eo_title:
                    continue

                eo_date = re.search(
                    r'([0-9]{1,2}).*/([0-9]{1,2}).*/([0-9]{4}|[0-9]{2})',
                    cols[2].xpath('string()'))
                if eo_date is None:
                    continue
                eo_date = '/'.join(eo_date.groups())
                try:
                    eo_date = dt.datetime.strptime(eo_date, '%m/%d/%y')
                except ValueError:
                    eo_date = dt.datetime.strptime(eo_date, '%m/%d/%Y')

                eo_source = cols[0].xpath('.//a')[0].get('href').lower()
                mime_type = MimeTypes().guess_type(eo_source)[0]
                if mime_type is None:
                    mime_type = 'text/html'

                # build yield object
                eo = Event(eo_num, eo_date, 'New Jersey', gov_session_name)
                eo.add_person(governor_name, 'governor')
                eo.description = eo_title
                eo.add_document(eo_num, eo_source, mime_type)
                eo.add_source(eo_source)
                yield eo
def get_events(self):
    """
    Sources:
    http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport
    http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport
    """
    # scrape attendance
    tmpdir = tempfile.mkdtemp()
    page = lxmlize("http://app.toronto.ca/tmmis/getAdminReport.do?"
                   "function=prepareMemberAttendanceReport")
    members = page.xpath(
        '//td[@class="inputText"]/select[@name="memberId"]/option')
    for member in members:
        post = {
            'function': 'getMemberAttendanceReport',
            'download': 'csv',
            'exportPublishReportId': 1,
            'termId': 4,
            'memberId': member.attrib['value'],
            'decisionBodyId': 0,
        }
        r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                          data=post)
        if r.headers['content-type'] != 'application/vnd.ms-excel':
            continue
        with open(tmpdir + '/' + member.text + '.csv', 'w') as attendance_file:
            attendance_file.write(r.text)

    # scrape events
    post = {
        'function': 'getMeetingScheduleReport',
        'download': 'csv',
        'exportPublishReportId': 3,
        'termId': 4,
        'decisionBodyId': 0,
    }
    r = requests.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                      data=post)

    empty = []
    with open('meetings.csv', 'w') as meeting_file:
        meeting_file.write(r.text)

    with open('meetings.csv', 'rb') as csvfile:
        csvfile = csv.reader(csvfile, delimiter=',')
        next(csvfile)  # skip the header row
        committee = ''
        agenda_items = []
        for row in csvfile:
            name = row[0]
            when = dt.datetime.strptime(row[2], "%Y-%m-%d")
            location = row[5]

            # Rows are grouped by committee; only re-fetch agenda items
            # when the committee changes.
            if name != committee:
                committee = name
                agenda_items = find_items(committee)

            e = Event(name=name, session=self.session, when=when,
                      location=location)

            # Attendance parsed from the per-member CSVs written above.
            attendees = find_attendees(tmpdir, row)
            if len(attendees) == 0:
                empty.append(row)
            for attendee in attendees:
                e.add_person(attendee)

            e.add_source("http://app.toronto.ca/tmmis/getAdminReport.do?"
                         "function=prepareMeetingScheduleReport")

            for item in agenda_items:
                if item['date'].date() == when.date():
                    i = e.add_agenda_item(item['description'])
                    i.add_committee(committee)
                    i['order'] = item['order']
                    for link in item['links']:
                        i.add_media_link(link['name'], link['url'],
                                         on_duplicate='ignore')
                    if 'notes' in item:
                        i['notes'] = [item['notes']]

            yield e

    shutil.rmtree(tmpdir)
    os.remove('meetings.csv')
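# find_items() and find_attendees() above are helpers defined elsewhere. A
# hedged sketch of find_attendees, assuming each per-member CSV written above
# has rows of [committee, date, attendance-flag] -- the real column layout
# may well differ:
import csv
import os

def find_attendees(tmpdir, meeting_row):
    """Return members whose attendance CSV marks them present at this
    meeting (hypothetical CSV layout)."""
    attendees = []
    committee, meeting_date = meeting_row[0], meeting_row[2]
    for fname in os.listdir(tmpdir):
        if not fname.endswith('.csv'):
            continue
        member = fname[:-len('.csv')]
        with open(os.path.join(tmpdir, fname)) as f:
            for att_row in csv.reader(f):
                if (len(att_row) >= 3
                        and att_row[0] == committee
                        and att_row[1] == meeting_date
                        and att_row[2].strip().upper() == 'Y'):
                    attendees.append(member)
                    break
    return attendees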
def get_events(self):
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # attempt to map the cells across table types;
            # if the sizes mismatch, ignore this one (it's an "empty" message)
            try:
                cell_mapping = self._organize_cells(
                    meeting_type, meeting.cssselect('td'))
            except Exception:
                continue

            meeting_title = cell_mapping['title'].text
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))

            e = Event(name=meeting_title, when=meeting_date,
                      session=self.session, location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # detect agenda url, if present
            meeting_agenda_url = None
            if len(cell_mapping['agenda'].cssselect('a')) > 0:
                meeting_agenda_url = \
                    cell_mapping['agenda'].cssselect('a')[0].attrib.get('href')

            # follow the agenda URL and attempt to extract associated documents
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda', url=meeting_agenda_url,
                               mimetype='text/html')

                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not len(link_url):
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always be correct
                        if link.text is not None:
                            e.add_document(name=link.text, url=link_url,
                                           mimetype='application/pdf')

            # skip everything below here for the 'upcoming' table
            if meeting_type == 'upcoming':
                continue

            # detect video
            # TODO: extract actual mp4 files
            video_cell = cell_mapping['video'].cssselect('a')
            if len(video_cell) > 0:
                # capture the URL itself, without the trailing quote
                video_url_match = re.search(
                    r"(http://.*?)'", video_cell[0].attrib.get('onclick', ''))
                if video_url_match is not None:
                    e.add_media_link(name="Video",
                                     url=video_url_match.group(1),
                                     mimetype='text/html')

            # detect audio
            audio_cell = cell_mapping['audio'].cssselect('a')
            if len(audio_cell) > 0:
                e.add_media_link(name="Audio",
                                 url=audio_cell[0].attrib.get('href', ''),
                                 mimetype='audio/mpeg')

            # detect minutes
            minutes_cell = cell_mapping['minutes'].cssselect('a')
            if len(minutes_cell) > 0:
                e.add_media_link(name="Minutes",
                                 url=minutes_cell[0].attrib.get('href', ''),
                                 mimetype='text/html')

            yield e
def migrate_events(self, state):
    spec = {}
    if state:
        spec['state'] = state

    for entry in self.billy_db.events.find(spec, timeout=False):
        e = Event(
            name=entry['description'],
            when=entry['when'],
            location=entry['location'],
            session=entry['session'],
            updated_at=entry['updated_at'],
            created_at=entry['created_at'],
            type=entry['type'],
        )
        e.identifiers = [{'scheme': 'openstates',
                          'identifier': entry['_id']}]
        e._openstates_id = entry['_id']

        if entry.get('+location_url'):
            e.add_location_url(entry['+location_url'])

        link = entry.get('link', entry.get("+link"))
        if link:
            e.add_link(link, 'link')

        blacklist = ["description", "when", "location", "session",
                     "updated_at", "created_at", "end", "sources",
                     "documents", "related_bills", "state", "+link",
                     "link", "level", "participants", "country",
                     "_all_ids", "type"]

        e.status = entry.get('status')
        typos = {"canceled": "cancelled"}
        if e.status in typos:
            e.status = typos[e.status]

        for key, value in entry.items():
            if key in blacklist or not value or key.startswith("_"):
                continue
            e.extras[key] = value

        if entry.get('end'):
            end = entry['end']
            try:
                end = dt.datetime.fromtimestamp(end)
            except TypeError:
                pass
            e.end = end

        for source in entry['sources']:
            e.add_source(url=source['url'])

        if e.sources == []:
            continue  # XXX: print warning

        for document in entry.get('documents', []):
            # Try to add the mimetype. If it fails, fall back to a generic
            # undeclared application/octet-stream.
            e.add_document(name=document.get('name'),
                           document_id=document.get('doc_id'),
                           url=document['url'],
                           mimetype=document.get(
                               "mimetype", document.get(
                                   "+mimetype", "application/octet-stream")))

        agenda = None
        for bill in entry.get('related_bills', []):
            if agenda is None:
                agenda = e.add_agenda_item(
                    description="Bills up for Consideration")

            hcid = _hot_cache.get(bill.get('id', None), None)
            bid = bill['bill_id']
            if bid is None:
                continue

            agenda.add_bill(bill=bid, id=hcid)

        for who in entry.get('participants', []):
            participant_type = who.get('participant_type', 'committee')
            # I've gone through the backlog of OpenStates data; they are
            # all committees of some sort.

            who_chamber = who.get('chamber')
            if who_chamber is None:
                for chamber in ["_chamber", "+chamber"]:
                    f = who.get(chamber)
                    if f:
                        who_chamber = f
                        break

            if who_chamber is None:
                # Freak of nature ...
                continue

            hcid = _hot_cache.get(who.get('id', None), None)

            e.add_participant(
                name=who['participant'],
                type={
                    "committee": "organization",
                    "legislator": "person",
                    "person": "person",
                }[participant_type],
                id=hcid,
                note=who['type'],
                chamber=who_chamber)

        self.save_object(e)
def get_events(self):
    page = self.lxmlize(PAGE)
    events = page.xpath("//div[@class='col-middle']//ul/li")
    when = None

    for event in events:
        # Date headings carry the date for the entries that follow.
        h3 = event.xpath("./a/h2")
        h3 = h3[0] if h3 else None
        if h3 is not None:
            when = h3.text
        else:
            if when is None:
                self.warning("Ungrok!")
                continue

            b, _, i = event.xpath("./p/*")
            title = b.text_content()
            event = i.text_content()
            if "NO MEETING" in event:
                continue

            day, title = (x.strip() for x in title.split("-", 1))
            where = "Council Chambers"

            for subevent in (x.strip() for x in event.split(";")):
                if " in " in subevent:
                    subevent, where = subevent.rsplit(" in ", 1)

                subevent = subevent.replace(u'\xa0', ' ')
                if "NO" in subevent and "MEETING" in subevent:
                    continue
                if "to follow" in subevent:
                    continue

                info = EVENT_RE.match(subevent).groupdict()
                event_name, time = [info[x] for x in ['event', 'time']]

                ampm = {"a.m.": "AM", "p.m.": "PM"}
                for old, new in ampm.items():
                    time = time.replace(old, new)

                dtstring = ", ".join([day, time])
                try:
                    etime = dt.datetime.strptime(dtstring,
                                                 "%m/%d/%Y, %I:%M %p")
                except ValueError:
                    etime = dt.datetime.strptime(dtstring,
                                                 "%m/%d/%Y, %I%p")

                e = Event(name=event_name, when=etime, location=where)
                e.add_source(PAGE)
                yield e
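# PAGE and EVENT_RE above are module-level constants that aren't shown. A
# hedged sketch consistent with how they're used here (named 'event' and
# 'time' groups; times like "7:30 p.m." or "7p.m."):
import re

PAGE = "http://example.gov/council/meetings"  # hypothetical URL

EVENT_RE = re.compile(
    r"(?P<event>.*?)\s+at\s+(?P<time>\d{1,2}(?::\d{2})?\s*[ap]\.m\.)")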