def parse_table_row(row, index):
    """Extract one student's details from a roster table row.

    :param row: the roster row element; its ``<td>`` children are read
    :param index: position of the row in the roster, used only to build
        error messages
    :return: a tuple ``(email, name, user_id, nav_element)`` where
        ``nav_element`` is the ``<a>`` element found in the name column
    :raises UnexpectedElementPropertyError: if the row does not contain
        exactly six ``<td>`` columns, or the user id column has no text
    :raises MissingElementError: if the email input or the name link is
        missing
    """
    expected_column_count = 6
    columns = row.find_all('td')
    if len(columns) != expected_column_count:
        raise errors.UnexpectedElementPropertyError(
            f'Expected roster row[{index}] to have {expected_column_count} '
            f'columns but found {len(columns)}.')
    _, email_col, name_col, id_col, _, _ = columns

    # the email is read from the title attribute of an image-type <input>
    email_input = email_col.find('input', type='image')
    if email_input is None:
        raise errors.MissingElementError(
            f'Roster row[{index}] has no email column.')
    email = email_input.get('title')

    # The link in the name column supplies both the displayed name and the
    # navigation element, so it is looked up once and reused.
    nav_element = name_col.find('a')
    if nav_element is None:
        raise errors.MissingElementError(
            f'Roster row[{index}] has no name column.')
    name = nav_element.get_text()

    # an empty id is treated as a malformed row
    user_id = id_col.get_text()
    if not user_id:
        raise errors.UnexpectedElementPropertyError(
            f'Roster row[{index}] has no value associated with the user id column.')

    return email, name, user_id, nav_element
def fetch_value(entry):
    """Return the normalized name and the text value of a table row.

    The name is taken from the row's ``<th>`` child: non-breaking spaces are
    removed, the text is lowercased, spaces become underscores and colons are
    dropped. The value is the text of the row's ``<td>`` child with
    non-breaking spaces removed.

    :param entry: a table element with inner ``<th>`` and ``<td>`` children
    :return: a ``(name, value)`` tuple
    :raises MissingElementError: if the element does not contain a ``<th>``
        or a ``<td>`` tag
    """
    header_cell = entry.find('th')
    data_cell = entry.find('td')

    # the header is validated first, then the value cell
    if header_cell is None:
        raise errors.MissingElementError(
            'Overview table row contains no name element <th></th>')
    if data_cell is None:
        raise errors.MissingElementError(
            'Overview table row contains no value element <td></td>')

    # build the key step by step, in the same order as the replacements apply
    label = header_cell.get_text(strip=True)
    label = label.replace(u'\xa0', '')
    label = label.lower()
    label = label.replace(' ', '_')
    label = label.replace(':', '')

    content = data_cell.get_text(separator=' ', strip=True)
    content = content.replace(u'\xa0', '')

    return label, content
def get_all_student_rows(self):
    """Parse the roster rows assigned to this thread.

    Rows are assigned by position: a row is handled when its index modulo
    ``self.num_threads`` equals ``self.thread_num``. Rows whose parsing
    raises a ``ScraperError`` are reported with a traceback and skipped.

    :return: a list of ``(email, name, user_id, nav_element)`` tuples, one
        per successfully parsed row
    :raises MissingElementError: if the roster table body is not found; the
        current page is first written to a uniquely named ``.html`` file in
        the working directory so the state can be inspected
    """
    thread_num = self.thread_num
    num_threads = self.num_threads

    table = self.dc.html.find('tbody', class_='gbody')
    if table is None:
        unique_filename = str(uuid.uuid4()) + '.html'
        # An explicit encoding keeps the dump from failing on non-ASCII
        # page content when the platform default encoding is not UTF-8.
        with open(unique_filename, 'w', encoding='utf-8') as file:
            file.write(str(self.dc.html))
        raise errors.MissingElementError(
            f'Roster table is missing. (state in {unique_filename})')

    results = []
    for row_index, row in enumerate(table.find_all('tr')):
        if row_index % num_threads != thread_num:
            continue  # another thread handles this row
        try:
            email, name, user_id, nav_element = self.parse_table_row(
                row, row_index)
            results.append((email, name, user_id, nav_element))
        except errors.ScraperError:
            # best effort: report the bad row and keep going
            traceback.print_exc()
    return results
def find_rows_to_parse(self):
    """Return the table rows that this thread is responsible for.

    Rows whose ``class`` contains ``'subItem'`` are ignored. The remaining
    rows are numbered from zero, and a row belongs to this thread when its
    number modulo ``self.num_threads`` equals ``self.thread_num``.

    :return: a list of the row elements assigned to this thread
    :raises MissingElementError: if the table body could not be found
    """
    worker = self.thread_num
    worker_count = self.num_threads

    tbody = self.dc.html.find('tbody', {'class': 'gbody'})
    if tbody is None:
        raise errors.MissingElementError('Table element not found.')

    # MyGCC includes hidden rows between each course displayed in the table.
    # They are irrelevant and must not count toward the row numbering.
    visible_rows = [
        candidate for candidate in tbody.find_all('tr')
        if 'subItem' not in candidate.get('class', '')
    ]

    return [
        candidate for position, candidate in enumerate(visible_rows)
        if position % worker_count == worker
    ]
def get_table(self, table_id):
    """Look up a ``<table>`` element by its ``id`` attribute.

    :param table_id: the unique identifier of the table to find
    :return: the matching table element
    :raises MissingElementError: if no table with that id is found
    """
    match = self.html.find('table', {'id': table_id})
    if match is not None:
        return match
    raise errors.MissingElementError(f'Table {table_id} not found.')
def get_all_student_rows(self):
    """Parse every row of the roster table.

    Rows whose parsing raises a ``ScraperError`` are reported with a
    traceback and left out of the result.

    :return: a list of ``(email, name, user_id, nav_element)`` tuples, one
        per successfully parsed row
    :raises MissingElementError: if the roster table body is not found
    """
    roster = self.scraper.html.find('tbody', class_='gbody')
    if roster is None:
        raise errors.MissingElementError('Roster table is missing.')

    students = []
    for position, table_row in enumerate(roster.find_all('tr')):
        try:
            email, name, user_id, nav_element = self.parse_table_row(
                table_row, position)
        except errors.ScraperError:
            # best effort: report the bad row and move on to the next one
            traceback.print_exc()
        else:
            students.append((email, name, user_id, nav_element))
    return students
def course_row_to_course(self, row):
    """Build a ``Course`` from one row of the courses list.

    Follows the row's link to the course overview page, reads the title,
    term and credit hours from it, optionally follows the requisites link,
    and navigates back to the courses list before returning. Every failure
    that happens after the overview page has been opened also navigates back
    to the courses list before the exception is raised.

    :param row: a course row element containing an ``<a>`` navigation link
    :return: a ``Course`` built from the code, title, term, credits and
        requisites that were read
    :raises MissingElementError: if the navigation link, the details
        element, or the title, term or credit element is missing
    :raises ValueError: if the credit hours text is not a valid number
    """
    # find the navigation element
    nav_element = row.find('a')
    if nav_element is None:
        raise errors.MissingElementError(
            'Course row navigation element missing.')

    # navigate to the course overview page
    action, payload = self.dc.prepare_payload(nav_element=nav_element)
    post_url = self.dc.BASE_URL + action
    self.dc.http_post(post_url, data=payload)

    # course code with section letters removed: keep only the first two
    # space-separated tokens of the link text
    course_code = ' '.join(nav_element.text.strip().split(' ')[:2])

    def require(element, message):
        # Return the element, or go back to the courses list and abort
        # parsing of this course when the element is missing.
        if element is None:
            self.nav_to_courses_from_course()
            raise errors.MissingElementError(message)
        return element

    # fetch data from this page
    details = require(
        self.dc.html.find('div', {'id': 'pg0_V_divCourseDetails'}),
        f'details element could not be found for {course_code}')

    # fetch course title (text before the first opening parenthesis)
    title_elem = require(
        details.find('b'),
        f'title element missing for {course_code}')
    course_title = title_elem.text.split('(', 1)[0].strip()

    # fetch course term
    term_elem = require(
        details.find('span', {'id': 'pg0_V_lblTermDescValue'}),
        f'term element missing for {course_code}')
    course_term = term_elem.text.strip().strip(',')

    # fetch course hours
    cred_elem = require(
        details.find('span', {'id': 'pg0_V_lblCreditHoursValue'}),
        f'credit element missing for {course_code}')
    try:
        course_credits = float(cred_elem.text.strip())
    except ValueError:
        # A malformed credit value must not leave the scraper stranded on
        # the course page; return to the courses list like the other abort
        # paths, then let the original error propagate.
        self.nav_to_courses_from_course()
        raise

    # fetch course requisites
    course_requisites = []
    prereqlink = self.dc.html.find('a', {'id': 'pg0_V_lnkbCourseRequisites'})
    if prereqlink is not None:
        # navigate to the course requisites page
        action, payload = self.dc.prepare_payload(nav_element=prereqlink)
        post_url = self.dc.BASE_URL + action
        self.dc.http_post(post_url, data=payload)
        course_requisites = self.parse_course_requisites()
        self.nav_to_courses_from_requisites()
    else:
        self.nav_to_courses_from_course()

    # construct course class from data
    return Course(course_code, course_title, course_term, course_credits,
                  course_requisites)