def meeting(self, tr):
    """
    Parses a <tr> from section listings that contains meeting information.

    :param tr: BeautifulSoup tag for one meeting row. After dropping the
        cells whose stripped text is empty, ``X`` or a single day letter,
        the remaining cells are read positionally: campus, building/room,
        days, time/schedule type, start date, end date, instructors.
    :returns: whatever ``utils.insert_keys_with_values`` returns for the keys
        ``location``, ``days``, ``start_time``, ``end_time``,
        ``schedule_type``, ``start_date``, ``end_date`` and ``instructors``
        (presumably a dict -- confirm against ``utils``).
    :raises ParserError: if the row has fewer than seven usable cells, no
        day is marked, or a time/date string cannot be parsed.
    """
    import calendar
    import datetime

    day_abbrs = (u'U', u'M', u'T', u'W', u'R', u'F', u'S')
    # Cells of the nested day grid (day letters, 'X' marks, blanks) are
    # dropped so the remaining cells can be addressed by position.
    rejects = day_abbrs + (u'', u'X')
    tds = [td for td in tr.select('td') if td.text.strip() not in rejects]
    if len(tds) < 7:
        raise ParserError(
            "Expected at least 7 data cells in Meeting <tr>, found %d" % len(tds))

    def to_24h(hour, meridiem):
        # "% 12" maps 12 am to 0 and lets 12 pm land back on 12 below.
        hour = int(hour) % 12
        if meridiem.lower() == 'pm':
            hour += 12
        return hour

    def parse_date(text):
        # Expects text like "Jan 13, 2014"; full month names also work
        # because only the first three letters are compared.
        text = text.strip()
        m = re.match(r'([A-Za-z]+)\s+([0-9]+),\s+([0-9]+)', text)
        if not m:
            raise ParserError("Failed to parse meeting date ('%s')" % text)
        month_name, day, year = m.groups()
        prefix = month_name[:3].lower()
        # NOTE: calendar.month_abbr follows the current locale; index 0 is
        # an empty string and therefore never matches.
        for number, abbr in enumerate(calendar.month_abbr):
            if abbr.lower().startswith(prefix):
                return datetime.date(year=int(year), month=number, day=int(day))
        raise ParserError("Unknown month '%s' in meeting date" % month_name)

    # Location
    campus = tds[0].text.strip()
    location_text = tds[1].text
    if u'Online Course Information' in location_text or u'TBA' in location_text:
        building, room = None, None
    else:
        # Building and room are the 1st and 3rd child nodes of the cell
        # (presumably separated by a <br> -- confirm against the page markup).
        building = tds[1].contents[0].strip()
        room = tds[1].contents[2].strip()
    location = utils.insert_keys_with_values(
        {}, ('campus', 'building', 'room_number'), [campus, building, room])

    # Days
    if u'TBA' in tds[2].text:
        days = None
    else:
        day_cells = tds[2].select('table > td')
        days = ''.join(day_abbrs[i] for i, cell in enumerate(day_cells)
                       if cell.text.strip() == u'X')
        if not days:
            raise ParserError("No days found in Meeting <tr>")

    # Times and Schedule Type
    if u'TBA' in tds[3].text.strip():
        start_time, end_time = None, None
        schedule_type = None
    else:
        timestr = tds[3].contents[0].strip()
        m = re.match(
            r'([0-9]+):([0-9]+)\s+([ap]m)\s+-\s+([0-9]+):([0-9]+)\s+([ap]m)',
            timestr, re.IGNORECASE)
        if not m:
            raise ParserError("Failed to parse meeting time ('%s')" % timestr)
        s_hr, s_min, s_mer, e_hr, e_min, e_mer = m.groups()
        start_time = datetime.time(to_24h(s_hr, s_mer), int(s_min))
        end_time = datetime.time(to_24h(e_hr, e_mer), int(e_min))
        # The schedule type, when present, is the cell's 3rd child node.
        if len(tds[3].contents) < 3:
            schedule_type = None
        else:
            schedule_type = tds[3].contents[2]

    # Start/End days
    start_date = parse_date(tds[4].text)
    end_date = parse_date(tds[5].text)

    # Instructors
    instructors = []
    for a in tds[6].find_all('a'):
        # The link's 'target' attribute holds the full name; the last
        # space-separated token is treated as the surname.
        name_parts = a.attrs['target'].rsplit(' ', 1)
        first = name_parts[0]
        last = name_parts[1] if len(name_parts) > 1 else ''
        instructors.append({'first': first,
                            'last': last,
                            'email': a.attrs['href'].replace('mailto:', '')})

    m_keys = ('location', 'days', 'start_time', 'end_time', 'schedule_type',
              'start_date', 'end_date', 'instructors')
    m_values = [location, days, start_time, end_time, schedule_type,
                start_date, end_date, instructors]
    return utils.insert_keys_with_values({}, m_keys, m_values)
def sections(self, sections_html, limit=None):
    """
    Parses the section listings page into a list of section records.

    The direct <tr> children of the section listings table are walked in
    order. A row with a ``th.ddlabel a`` link starts a new section; later
    rows add seating numbers, meetings, fees and comments to the section
    currently being built.

    :param sections_html: markup handed to ``BeautifulSoup``.
    :param limit: when truthy, parsing stops at the first row for which
        ``utils.has_empty_dddefault`` is true once ``limit - 1`` sections
        have been completed; the section in progress is still emitted.
    :returns: list of section records, each guaranteed to carry the keys
        ``meetings``, ``comments`` and ``fees``.
    :raises ParserError: if the listings table is missing or a section
        header cannot be parsed.
    """
    from decimal import Decimal

    t0 = time.time()

    # BeautifulSoup it
    bs = BeautifulSoup(sections_html)
    sections_table = bs.find('table', attrs={
        'class': 'datadisplaytable',
        'summary': 'This layout table is used to present the sections found'})
    if sections_table is None:
        raise ParserError("Could not find the section listings table")
    t1 = time.time()
    logger.debug('Found section listings table (%s seconds)', t1 - t0)

    # Select all direct <tr> children
    st_rows = sections_table.find_all('tr', recursive=False)
    t2 = time.time()
    logger.debug('Isolated <tr> tags (%s seconds)', t2 - t1)

    sections = []
    cur_section = {}

    def finish_section(section, label):
        # Fill in the list-valued keys a section may lack, store it, log it.
        for key in ('meetings', 'comments', 'fees'):
            section.setdefault(key, [])
        sections.append(section)
        logger.debug('Parsed %s: %s%s/%s - %s (%s Meetings)',
                     label,
                     section.get('subject_code'),
                     section.get('course_number'),
                     section.get('section_number'),
                     section.get('title'),
                     len(section['meetings']))

    # Iterate over tags and sequentially extract data
    for row in st_rows:
        # Finish iterating if the limit has been reached
        if limit and len(sections) >= limit - 1 and utils.has_empty_dddefault(row):
            break

        # Section header
        if row.select('th.ddlabel a'):
            # Section header denotes the beginning of a new section
            # Load any previously scraped section, and reset cur_section
            if cur_section:
                finish_section(cur_section, 'section')
                cur_section = {}
            elif sections:
                # An empty section before the first header is expected;
                # after that it means something was lost.
                logger.warning("'cur_section' was empty when attempting to add it to 'sections'.")

            header = row.find('a').text
            m = re.match(r'([A-Z]+)\s+([A-Z0-9]+)/([A-Z0-9]+)\s+-\s+(.+)', header)
            if not m:
                raise ParserError("Could not parse section header ('%s')" % header)
            keys = ('subject_code', 'course_number', 'section_number', 'title')
            cur_section = utils.insert_keys_with_values(cur_section, keys, m.groups())

        # Section-specific info
        if row.find('table', {'summary': 'This layout table is used to present the seating numbers.'}):
            keys = ('crn', 'credit_hours', 'term_type', 'capacity', 'enrolled',
                    'waitlist_capacity', 'waitlist_count')
            values = [td.text.strip() for td in row.select('table tr td')]
            del values[-4]  # Delete 'Seats Available'
            del values[-1]  # Delete 'Waitlist Availability'
            cur_section = utils.insert_keys_with_values(cur_section, keys, values)

        # Meetings
        if row.find('table', {'summary': 'This table lists the scheduled meeting times and assigned instructors for this class..'}):
            # Parse the <tr>s that contain data (rows with a <th> are headers)
            # NOTE(review): this reads the FIRST table in the row, which is
            # assumed to be the meetings table matched above -- confirm.
            data_rows = [r for r in row.find('table').find_all('tr', recursive=False)
                         if not r.find('th')]
            for data_row in data_rows:
                meeting = self.meeting(data_row)
                cur_section.setdefault('meetings', []).append(meeting)

        # Fees
        if row.find('table', {'summary': 'This layout table is used to present the fee amounts and descriptions.'}):
            fee_tds = [td.text.strip() for td in row.select('table tr td')]
            # Cells come in groups of 3; only the 2nd (description) and
            # 3rd (amount) of each group are kept.
            fees = []
            for start in range(0, len(fee_tds), 3):
                chunk = fee_tds[start:start + 3]
                values = [chunk[1], str(Decimal(chunk[2]))]
                fees.append(utils.insert_keys_with_values(
                    {}, ('description', 'amount'), values))
            cur_section['fees'] = fees

        # Comments
        if row.find('table', {'summary': 'This layout table is used to present comments.'}):
            comment_tds = row.select('table > tr > td')
            if len(comment_tds) > 0:
                cur_section['comments'] = [
                    td.text.replace('Comments:', '').strip()
                    for td in comment_tds
                    if 'See Course Highlights' not in td.text]

        # Ignoring Attributes (should be already parsed with courses)

    # Emit the section that was still being built when iteration ended
    if cur_section:
        finish_section(cur_section, 'LAST section')

    logger.debug('Parsed %s sections (%s seconds)', len(sections), time.time() - t2)
    return sections