Пример #1
0
    def meeting(self, tr):
        """
        Parses a <tr> from section listings that contains meeting information.

        After filtering, the row's <td> cells are read by position: campus,
        building/room, days, time/schedule type, start date, end date and
        instructors.

        :param tr: the meeting row. It must provide ``select('td')`` and its
            cells must provide ``text``, ``contents``, ``select`` and
            ``find_all`` (presumably a BeautifulSoup tag -- confirm with the
            caller).
        :returns: whatever ``utils.insert_keys_with_values`` builds from an
            empty dict and the keys ``location``, ``days``, ``start_time``,
            ``end_time``, ``schedule_type``, ``start_date``, ``end_date`` and
            ``instructors`` (presumably that same dict, filled in).
        :raises ParserError: if no day is marked, or if the meeting time or
            one of the dates cannot be parsed.
        """
        import calendar
        import datetime

        day_abbrs = (u'U', u'M', u'T', u'W', u'R', u'F', u'S')

        # Cells whose stripped text is a lone day letter, an 'X' marker or
        # empty are dropped so that the positional indexes below line up.
        rejects = day_abbrs + (u'', u'X')
        tds = [td for td in tr.select('td') if td.text.strip() not in rejects]

        def to_24_hour(hour, meridiem):
            # 12 am is midnight (hour 0) and 12 pm is noon (hour 12).
            hour %= 12
            if meridiem.lower() == 'pm':
                hour += 12
            return hour

        def parse_date(text):
            # Expects text such as "<month name> <day>, <year>".
            text = text.strip()
            found = re.match(r'([A-Za-z]+)\s+([0-9]+),\s+([0-9]+)', text)
            if not found:
                raise ParserError("Failed to parse meeting date ('%s')" % text)
            month_name, day, year = found.groups()
            # Accept both abbreviated and full month names by prefix match.
            # calendar.month_abbr[0] is an empty placeholder and is skipped.
            wanted = month_name.lower()
            for number, abbr in enumerate(calendar.month_abbr):
                if abbr and wanted.startswith(abbr.lower()):
                    return datetime.date(year=int(year), month=number,
                                         day=int(day))
            raise ParserError("Unknown month in meeting date ('%s')" % text)

        # Location
        loc_keys = ('campus', 'building', 'room_number')
        campus = tds[0].text.strip()
        place = tds[1]
        if u'Online Course Information' in place.text or u'TBA' in place.text:
            building, room = None, None
        else:
            # Building and room are the first and third child nodes of the
            # cell (the node in between is skipped).
            building = place.contents[0].strip()
            room = place.contents[2].strip()
        location = utils.insert_keys_with_values(
            {}, loc_keys, [campus, building, room])

        # Days
        if u'TBA' in tds[2].text:
            days = None
        else:
            # NOTE(review): 'table > td' only matches <td> elements that are
            # direct children of a <table>; confirm the markup has no <tr>
            # in between.
            xdays = tds[2].select('table > td')
            # zip() stops at seven entries, so surplus cells cannot cause an
            # out-of-range lookup.
            days = u''.join(abbr for abbr, cell in zip(day_abbrs, xdays)
                            if cell.text.strip() == u'X')
            if not days:
                raise ParserError("No days found in Meeting <tr>")

        # Times and Schedule Type
        when = tds[3]
        if u'TBA' in when.text.strip():
            start_time, end_time = None, None
            schedule_type = None
        else:
            timestr = when.contents[0].strip()
            m = re.match(
                r'([0-9]+):([0-9]+)\s+([ap]m)\s+-\s+'
                r'([0-9]+):([0-9]+)\s+([ap]m)',
                timestr, re.IGNORECASE)
            if not m:
                raise ParserError(
                    "Failed to parse meeting time ('%s')" % timestr)
            s_hr, s_min, s_ampm, e_hr, e_min, e_ampm = m.groups()
            start_time = datetime.time(to_24_hour(int(s_hr), s_ampm),
                                       int(s_min))
            end_time = datetime.time(to_24_hour(int(e_hr), e_ampm),
                                     int(e_min))

            # The schedule type, when present, is the third child node.
            if len(when.contents) < 3:
                schedule_type = None
            else:
                schedule_type = when.contents[2]

        # Start/End days
        start_date = parse_date(tds[4].text)
        end_date = parse_date(tds[5].text)

        # Instructors
        instructors = []
        for a in tds[6].find_all('a'):
            # The link's 'target' attribute carries the full name; the last
            # space-separated token is taken as the last name.
            parts = a.attrs['target'].strip().rsplit(' ', 1)
            instructors.append({
                'first': parts[0],
                'last': parts[1] if len(parts) > 1 else u'',
                'email': a.attrs['href'].replace('mailto:', ''),
            })

        m_keys = ('location', 'days', 'start_time', 'end_time',
                  'schedule_type', 'start_date', 'end_date', 'instructors')
        m_values = [location, days, start_time, end_time, schedule_type,
                    start_date, end_date, instructors]
        return utils.insert_keys_with_values({}, m_keys, m_values)
Пример #2
0
    def sections(self, sections_html, limit=None):
        """
        Parses the section listings page into a list of sections.

        The direct <tr> children of the listings table are walked in order.
        A row with a ``th.ddlabel a`` link starts a new section; the rows
        that follow add seating numbers, meetings, fees and comments to it.

        :param sections_html: markup of the section listings page, handed to
            ``BeautifulSoup``.
        :param limit: when truthy, parsing stops at the first row for which
            ``utils.has_empty_dddefault`` is true once at least ``limit - 1``
            sections have been stored; the section in progress is then stored
            as the last one.
        :returns: list of the parsed sections. Each stored section is
            guaranteed to have ``meetings``, ``comments`` and ``fees``
            entries (empty lists when the page had none). The other entries
            come from ``utils.insert_keys_with_values`` (presumably it sets
            the given keys on the dict and returns it -- confirm).
        :raises ParserError: if the listings table is missing, a section
            header cannot be parsed, or a section's ``meetings`` entry is not
            a list. Errors raised by ``self.meeting`` propagate unchanged.
        """
        from decimal import Decimal

        t0 = time.time()
        # BeautifulSoup it
        bs = BeautifulSoup(sections_html)
        sections_table = bs.find('table', attrs={'class': 'datadisplaytable',
                                                 'summary': 'This layout table is used to present the sections found'})
        if sections_table is None:
            raise ParserError("Could not find the section listings table")
        t1 = time.time()
        logger.debug('Found section listings table (%s seconds)' % (t1 - t0))

        # Select all direct <tr> children
        st_rows = sections_table.find_all('tr', recursive=False)
        t2 = time.time()
        logger.debug('Isolated <tr> tags (%s seconds)' % (t2 - t1))

        sections = []
        cur_section = {}

        def store(section, label):
            # Fill in the optional lists, then record and log the section.
            for key in ('meetings', 'comments', 'fees'):
                section.setdefault(key, [])
            sections.append(section)
            logger.debug('%s: %s%s/%s - %s (%s Meetings)' % (
                label, section['subject_code'], section['course_number'],
                section['section_number'], section['title'],
                len(section['meetings'])))

        # Iterate over tags and sequentially extract data
        for row in st_rows:
            # Finish iterating if the limit has been reached
            if limit and len(sections) >= limit - 1 and utils.has_empty_dddefault(row):
                break

            # Section header
            if row.select('th.ddlabel a'):
                # Section header denotes the beginning of a new section
                # Load any previously scraped sections, and reset cur_section
                if cur_section:
                    store(cur_section, 'Parsed section')
                    cur_section = {}
                else:
                    # This branch is taken for the first header, because
                    # cur_section starts out empty.
                    logger.warning("'cur_section' was empty when attempting to add it to 'sections'.")

                header = row.find('a').text
                m = re.match(r'([A-Z]+)\s+([A-Z0-9]+)/([A-Z0-9]+)\s+-\s+(.+)', header)
                if not m:
                    raise ParserError("Could not parse section header ('%s')" % header)
                keys = ('subject_code', 'course_number', 'section_number', 'title')
                cur_section = utils.insert_keys_with_values(cur_section, keys, m.groups())

            # Section-specific info
            if row.find('table', {'summary': 'This layout table is used to present the seating numbers.'}):
                keys = (
                    'crn', 'credit_hours', 'term_type', 'capacity', 'enrolled', 'waitlist_capacity', 'waitlist_count')
                values = [td.text.strip() for td in row.select('table tr td')]
                del values[-4]  # Delete 'Seats Available'
                del values[-1]  # Delete 'Waitlist Availability'
                cur_section = utils.insert_keys_with_values(cur_section, keys, values)

            # Meetings
            if row.find('table',
                        {'summary': 'This table lists the scheduled meeting times and assigned instructors for this class..'}):
                # Parse the <tr>s that contain data (rows with a <th> are
                # skipped)
                for meeting_row in row.find('table').find_all('tr', recursive=False):
                    if meeting_row.find('th'):
                        continue
                    meeting = self.meeting(meeting_row)
                    if 'meetings' not in cur_section:
                        cur_section['meetings'] = [meeting]
                    elif isinstance(cur_section['meetings'], list):
                        cur_section['meetings'].append(meeting)
                    else:
                        raise ParserError(
                            "cur_section['meetings'] is a %s, but it should be a list." % type(
                                cur_section['meetings']))

            # Fees
            if row.find('table',
                        {'summary': 'This layout table is used to present the fee amounts and descriptions.'}):
                fee_tds = [td.text.strip() for td in row.select('table tr td')]

                # Break into chunks of 3 and keep only Description and Amount
                # (the second and third cell of each chunk)
                fees = []
                for start in range(0, len(fee_tds), 3):
                    chunk = fee_tds[start:start + 3]
                    keys = ('description', 'amount')
                    # Round-trip through Decimal to validate the amount
                    values = [chunk[1], str(Decimal(chunk[2]))]
                    fees.append(utils.insert_keys_with_values({}, keys, values))
                cur_section['fees'] = fees

            # Comments
            if row.find('table', {'summary': 'This layout table is used to present comments.'}):
                comment_tds = row.select('table > tr > td')
                if len(comment_tds) > 0:
                    comments = [td.text.replace('Comments:', '').strip() for td in comment_tds if
                                'See Course Highlights' not in td.text]
                    cur_section['comments'] = comments

                    # Ignoring Attributes (should be already parsed with courses)

        # Store the section that was still in progress when the rows ran out
        # or the limit was reached.
        if cur_section:
            store(cur_section, 'Parsed LAST section')

        logger.debug('Parsed %s sections (%s seconds)' % (len(sections), time.time() - t2))
        return sections