예제 #1
0
    def parse_table_row(row, index):
        """Extracts a student's details from a single roster table row.

        :param row: the roster row element; must contain exactly six <td> cells
        :param index: position of the row in the roster, used in error messages
        :return: a tuple of (email, name, user_id, nav_element), where
            nav_element is the <a> element found in the name cell
        :raises UnexpectedElementPropertyError: if the row does not have six
            cells, or the user id cell holds no non-whitespace text
        :raises MissingElementError: if the email input or the name link is
            missing
        """
        expected_column_count = 6
        columns = row.find_all('td')

        if len(columns) != expected_column_count:
            raise errors.UnexpectedElementPropertyError(
                f'Expected roster row[{index}] to have {expected_column_count} '
                f'columns but found {len(columns)}.')

        # only the second, third and fourth cells are read
        _, email_col, name_col, id_col, _, _ = columns

        # the email address is taken from the title attribute of an image input
        email_input = email_col.find('input', type='image')
        if email_input is None:
            raise errors.MissingElementError(f'Roster row[{index}] has no email column.')
        email = email_input.get('title')

        # the link in the name cell supplies both the displayed name and the
        #   navigation element, so a single lookup covers both
        nav_element = name_col.find('a')
        if nav_element is None:
            raise errors.MissingElementError(f'Roster row[{index}] has no name column.')
        name = nav_element.get_text()

        # a cell holding only whitespace is treated as having no user id; the
        #   text itself is returned unmodified
        user_id = id_col.get_text()
        if not user_id.strip():
            raise errors.UnexpectedElementPropertyError(
                f'Roster row[{index}] has no value associated with the user id column.')

        return email, name, user_id, nav_element
예제 #2
0
    def fetch_value(entry):
        """Reads the name and value held by one row of the overview table.

        The name comes from the row's <th> cell: non-breaking spaces are
        removed, the text is lower-cased, spaces become underscores and colons
        are dropped. The value is the text of the row's <td> cell, with its
        text fragments joined by single spaces and non-breaking spaces removed.

        :param entry: a table row element with a <th> and a <td> child
        :return: a (name, value) tuple of strings
        :raises MissingElementError: if the row has no <th> or no <td> element
        """
        header_cell = entry.find('th')
        data_cell = entry.find('td')

        # the header is validated first, so its error is the one reported when
        #   both cells are absent
        if header_cell is None:
            raise errors.MissingElementError(
                'Overview table row contains no name element <th></th>')
        if data_cell is None:
            raise errors.MissingElementError(
                'Overview table row contains no value element <td></td>')

        # turn the label into a key-like name
        label = header_cell.get_text(strip=True).replace('\xa0', '')
        name = label.lower().replace(' ', '_').replace(':', '')

        text = data_cell.get_text(separator=' ', strip=True)
        value = text.replace('\xa0', '')

        return name, value
예제 #3
0
    def get_all_student_rows(self):
        """Parses the roster rows that this thread is responsible for.

        Rows are divided between threads by index: row ``i`` is handled here
        when ``i % self.num_threads == self.thread_num``. A row whose parsing
        raises a ScraperError is reported with a traceback and skipped.

        :return: a list of (email, name, user_id, nav_element) tuples, one per
            successfully parsed row
        :raises MissingElementError: if the roster table body is not on the
            page; the page is first written to a uniquely named .html file
            whose name is included in the error message
        """
        thread_num = self.thread_num
        num_threads = self.num_threads

        table = self.dc.html.find('tbody', class_='gbody')
        if table is None:
            # keep a copy of the page so the failure can be inspected later;
            #   the encoding is explicit so that non-ASCII page content does
            #   not depend on the platform's default encoding
            unique_filename = str(uuid.uuid4()) + '.html'
            with open(unique_filename, 'w', encoding='utf-8') as dump_file:
                dump_file.write(str(self.dc.html))
            raise errors.MissingElementError(
                f'Roster table is missing. (state in {unique_filename})')

        results = []
        for row_index, row in enumerate(table.find_all('tr')):
            # rows belonging to other threads are left alone
            if row_index % num_threads != thread_num:
                continue
            try:
                email, name, user_id, nav_element = self.parse_table_row(
                    row, row_index)
                results.append((email, name, user_id, nav_element))
            except errors.ScraperError:
                # a bad row must not stop the remaining rows from being parsed
                traceback.print_exc()

        return results
예제 #4
0
    def find_rows_to_parse(self):
        """ Collects the table rows that this thread has to process.

        Rows carrying the 'subItem' class are ignored. The remaining rows are
        numbered from zero, and row ``i`` is kept when
        ``i % self.num_threads == self.thread_num``.

        :return: a list of the row elements assigned to this thread
        :raises MissingElementError: if the table body cannot be found
        """
        worker_slot = self.thread_num
        worker_count = self.num_threads

        # locate the body of the table holding the rows
        body = self.dc.html.find('tbody', {'class': 'gbody'})
        if body is None:
            raise errors.MissingElementError('Table element not found.')

        # MyGCC places hidden 'subItem' rows between the displayed courses;
        #   they carry no course data and do not take part in the numbering
        visible_rows = (
            candidate for candidate in body.find_all('tr')
            if 'subItem' not in candidate.get('class', ''))

        # keep only the rows whose position falls to this thread
        return [
            candidate for position, candidate in enumerate(visible_rows)
            if position % worker_count == worker_slot]
예제 #5
0
    def get_table(self, table_id):
        """Looks up the <table> element whose id attribute is ``table_id``.

        :param table_id: the id attribute value of the table to look for
        :return: the matching table element
        :raises MissingElementError: if no such table is found
        """
        match = self.html.find('table', {'id': table_id})
        if match is not None:
            return match

        raise errors.MissingElementError(f'Table {table_id} not found.')
예제 #6
0
    def get_all_student_rows(self):
        """Parses every row of the roster table.

        Each row is handed to ``parse_table_row`` together with its position
        in the table. A row whose parsing raises a ScraperError is reported
        with a traceback and left out of the result.

        :return: a list of (email, name, user_id, nav_element) tuples, one per
            successfully parsed row
        :raises MissingElementError: if the roster table body is not on the
            page
        """
        roster_body = self.scraper.html.find('tbody', class_='gbody')
        if roster_body is None:
            raise errors.MissingElementError('Roster table is missing.')

        students = []
        for position, roster_row in enumerate(roster_body.find_all('tr')):
            try:
                parsed = self.parse_table_row(roster_row, position)
                email, name, user_id, nav_element = parsed
            except errors.ScraperError:
                # report the bad row and carry on with the rest
                traceback.print_exc()
            else:
                students.append((email, name, user_id, nav_element))

        return students
예제 #7
0
    def course_row_to_course(self, row):
        """Builds a Course from one row of the courses list.

        Follows the row's link to the course overview page, reads the course
        code, title, term and credit hours, and follows the requisites link
        when the page has one. Apart from a row with no link (where nothing
        has been navigated yet), every exit path raised by this method first
        calls one of the ``nav_to_courses_from_*`` methods so the scraper is
        returned to the courses list.

        :param row: a courses-list row element containing an <a> link to the
            course overview page
        :return: a Course built from the code, title, term, credit hours and
            requisites
        :raises MissingElementError: if the row has no link, or the overview
            page lacks the details, title, term or credit element
        :raises ValueError: if the credit hours text is not a number
        """

        def require(element, message):
            # a missing element aborts parsing; return to the courses list
            #   first so the next row starts from the expected page
            if element is None:
                self.nav_to_courses_from_course()
                raise errors.MissingElementError(message)
            return element

        # find the navigation element; nothing has been navigated yet, so no
        #   return trip is needed when it is missing
        nav_element = row.find('a')
        if nav_element is None:
            raise errors.MissingElementError(
                'Course row navigation element missing.')

        # navigate to the course overview page
        action, payload = self.dc.prepare_payload(nav_element=nav_element)
        post_url = self.dc.BASE_URL + action
        self.dc.http_post(post_url, data=payload)

        # course code with section letters removed
        course_code = ' '.join(nav_element.text.strip().split(' ')[:2])

        # fetch data from this page
        details = require(
            self.dc.html.find('div', {'id': 'pg0_V_divCourseDetails'}),
            f'details element could not be found for {course_code}')

        # fetch course title; anything from the first '(' onwards is dropped
        title_elem = require(
            details.find('b'),
            f'title element missing for {course_code}')
        course_title = title_elem.text.split('(', 1)[0].strip()

        # fetch course term
        term_elem = require(
            details.find('span', {'id': 'pg0_V_lblTermDescValue'}),
            f'term element missing for {course_code}')
        course_term = term_elem.text.strip().strip(',')

        # fetch course hours
        cred_elem = require(
            details.find('span', {'id': 'pg0_V_lblCreditHoursValue'}),
            f'credit element missing for {course_code}')
        try:
            course_credits = float(cred_elem.text.strip())
        except ValueError:
            # a non-numeric credit value aborts parsing just like a missing
            #   element does, so return to the courses list before the error
            #   propagates
            self.nav_to_courses_from_course()
            raise

        # fetch course requisites
        course_requisites = []
        prereqlink = self.dc.html.find('a',
                                       {'id': 'pg0_V_lnkbCourseRequisites'})
        if prereqlink is not None:

            # navigate to the course requisites page
            action, payload = self.dc.prepare_payload(nav_element=prereqlink)
            post_url = self.dc.BASE_URL + action
            self.dc.http_post(post_url, data=payload)
            course_requisites = self.parse_course_requisites()

            self.nav_to_courses_from_requisites()
        else:
            self.nav_to_courses_from_course()

        # construct course class from data
        return Course(course_code, course_title, course_term, course_credits,
                      course_requisites)