def extract_time_range(self, unformatted_time_range):
    """Parse a '<start> - <end>' time range and store it on the ingestor.

    The text before and after the ' - ' separator is written to
    ``self.ingestor['time_start']`` and ``self.ingestor['time_end']``,
    with every bare ``a``/``p`` expanded to ``am``/``pm``.

    Args:
        unformatted_time_range: Raw time text; presumably of the form
            ``'10:00a - 11:15a'`` (confirm against the scraped page).

    Raises:
        ParseJump: If the text is ``'TBA'`` or empty, or if it does not
            contain the ' - ' separator.
    """
    if unformatted_time_range in ('TBA', ''):
        raise ParseJump(self.ingestor['course_code'] + ' time TBA')

    search = re.match(r'(.*) \- (.*)', unformatted_time_range)
    if search is None:
        raise ParseJump('time not found on page')

    def ampm(x):
        # Append 'm' only to an 'a'/'p' that is not already followed by
        # 'm'. '10:00a' still becomes '10:00am', but an already complete
        # '10:00am' is left alone instead of turning into '10:00amm'.
        return re.sub(r'([ap])(?!m)', r'\1m', x)

    self.ingestor['time_start'] = ampm(search.group(1))
    self.ingestor['time_end'] = ampm(search.group(2))
def extract_time_range(self, unformatted_time_range):
    """Parse a "<start> - <end>" time range and store it on the ingestor.

    The text before and after the " - " separator is written to
    ``self.ingestor["time_start"]`` and ``self.ingestor["time_end"]``,
    with every bare ``a``/``p`` expanded to ``am``/``pm``.

    Args:
        unformatted_time_range: Raw time text; presumably of the form
            ``"10:00a - 11:15a"`` (confirm against the scraped page).

    Raises:
        ParseJump: If the text is ``"TBA"`` or empty, or if it does not
            contain the " - " separator.
    """
    if unformatted_time_range in ("TBA", ""):
        raise ParseJump(self.ingestor["course_code"] + " time TBA")

    search = re.match(r"(.*) \- (.*)", unformatted_time_range)
    if search is None:
        raise ParseJump("time not found on page")

    def ampm(x):
        # Append "m" only to an "a"/"p" that is not already followed by
        # "m". "10:00a" still becomes "10:00am", but an already complete
        # "10:00am" is left alone instead of turning into "10:00amm".
        return re.sub(r"([ap])(?!m)", r"\1m", x)

    self.ingestor["time_start"] = ampm(search.group(1))
    self.ingestor["time_end"] = ampm(search.group(2))
def extract_days(self, unformatted_days):
    """Split a day string into single characters and store them.

    The resulting list is written to ``self.ingestor['days']``.

    Args:
        unformatted_days: Raw day text; each character becomes one entry
            (presumably one letter per weekday, e.g. 'MWF' -- confirm).

    Raises:
        ParseJump: If the text is ``'TBA'`` or empty.
    """
    is_unscheduled = unformatted_days in ('TBA', '')
    if is_unscheduled:
        message = self.ingestor['course_code'] + ' days TBA'
        raise ParseJump(message)

    day_letters = list(unformatted_days)
    self.ingestor['days'] = day_letters
def parse_course(self, soup):
    """Parse one class listing, fetch its detail page and ingest it.

    Reads the class number and term code from the listing's onclick
    handler, requests the class-section detail page, fills the ingestor
    with the course name, course code and section code, hands each detail
    panel to the parser matching its header, and finally ingests the
    course and its section.

    Args:
        soup: Parsed listing markup for a single class (presumably a
            BeautifulSoup node -- it is only used through ``find``).

    Returns:
        The class number captured from the onclick handler.

    Raises:
        ParseJump: If the class is marked cancelled or its heading does
            not contain a ``dept-catalog-section`` abbreviation.
        ParseError: If the detail page has a different number of detail
            headers and detail panels.
    """
    # Cancelled classes are skipped.
    if soup.find('a', class_='cancelledStatus'):
        raise ParseJump('cancelled course')

    # The onclick handler carries the class number and term code that the
    # detail request needs.
    onclick = soup.find('td', class_='classSection')['onclick']
    ids = re.search(
        r"showClassDetailPanel.fire\({classNumber : '([0-9]*)', termCode : '([0-9]*)',",
        onclick)
    course_number = ids.group(1)
    term_code = ids.group(2)

    detail_page = self.requester.get(
        Parser.URL + '/GetClassSectionDetail.action',
        params={'classNumber': course_number, 'termCode': term_code})

    # The dialog heading holds "<abbr>:..." on one line and the course
    # name on the next.
    heading = detail_page.find(id='classSectionDetailDialog').find('h1').text
    heading_match = re.search(r'(.*):.*\n(.*)', heading)
    abbr = heading_match.group(1)

    # abbr is split into department code, catalog ID and section number.
    title = re.match(r'(\S*)-(\S*)-(\S*)', abbr)
    if not title:
        raise ParseJump('no title in course')
    department, catalog_id, section = title.groups()
    self.ingestor['course_name'] = heading_match.group(2)
    self.ingestor['course_code'] = department + '-' + catalog_id
    self.ingestor['section_code'] = '(' + section.strip() + ')'

    # Each detail header is paired positionally with a detail panel.
    detail_headers = detail_page.find_all('div', class_='detailHeader')
    detail_panels = detail_page.find_all('div', class_='detailPanel')
    if len(detail_headers) != len(detail_panels):
        raise ParseError('there should be equal detail headers and panels')

    # Header text -> name of the method that parses the matching panel.
    # "Cross Listings" and "Ad Hoc Meeting Times" are intentionally not
    # parsed; any other unlisted header is ignored as well. Methods are
    # looked up by name only when their header actually appears.
    panel_parsers = {
        "Details": 'parse_labeled_table',
        "Availability": 'parse_labeled_table',
        "Description": 'extract_description',
        "Notes": 'extract_notes',
        "Meeting Times": 'parse_meeting_times',
        "Attributes": 'parse_attributes',
    }
    for header_div, panel in zip(detail_headers, detail_panels):
        parser_name = panel_parsers.get(header_div.text.strip())
        if parser_name is not None:
            getattr(self, parser_name)(panel)

    course = self.ingestor.ingest_course()
    self.ingestor.ingest_section(course)
    # Clear the meetings list once the section has been ingested.
    self.ingestor['meetings'] = []
    return course_number
def extract_days(self, unformatted_days):
    """Split a day string into single characters and store them.

    The resulting list is written to ``self.ingestor["days"]``.

    Args:
        unformatted_days: Raw day text; each character becomes one entry
            (presumably one letter per weekday, e.g. "MWF" -- confirm).

    Raises:
        ParseJump: If the text is ``"TBA"`` or empty.
    """
    is_unscheduled = unformatted_days in ("TBA", "")
    if is_unscheduled:
        message = self.ingestor["course_code"] + " days TBA"
        raise ParseJump(message)

    day_letters = list(unformatted_days)
    self.ingestor["days"] = day_letters
def parse_course(self, soup):
    """Parse one class listing, fetch its detail page and ingest it.

    Reads the class number and term code from the listing's onclick
    handler, requests the class-section detail page, fills the ingestor
    with the course name, course code and section code, hands each detail
    panel to the parser matching its header, and finally ingests the
    course and its section.

    Args:
        soup: Parsed listing markup for a single class (presumably a
            BeautifulSoup node -- it is only used through ``find``).

    Returns:
        The class number captured from the onclick handler.

    Raises:
        ParseJump: If the class is marked cancelled or its heading does
            not contain a ``dept-catalog-section`` abbreviation.
        ParseError: If the detail page has a different number of detail
            headers and detail panels.
    """
    # Cancelled classes are skipped.
    if soup.find("a", class_="cancelledStatus"):
        raise ParseJump("cancelled course")

    # The onclick handler carries the class number and term code that the
    # detail request needs.
    onclick = soup.find("td", class_="classSection")["onclick"]
    ids = re.search(
        r"showClassDetailPanel.fire\({classNumber : '([0-9]*)', termCode : '([0-9]*)',",
        onclick,
    )
    course_number = ids.group(1)
    term_code = ids.group(2)

    detail_page = self.requester.get(
        Parser.URL + "/GetClassSectionDetail.action",
        params={"classNumber": course_number, "termCode": term_code},
    )

    # The dialog heading holds "<abbr>:..." on one line and the course
    # name on the next.
    heading = detail_page.find(id="classSectionDetailDialog").find("h1").text
    heading_match = re.search(r"(.*):.*\n(.*)", heading)
    abbr = heading_match.group(1)

    # abbr is split into department code, catalog ID and section number.
    title = re.match(r"(\S*)-(\S*)-(\S*)", abbr)
    if not title:
        raise ParseJump("no title in course")
    department, catalog_id, section = title.groups()
    self.ingestor["course_name"] = heading_match.group(2)
    self.ingestor["course_code"] = department + "-" + catalog_id
    self.ingestor["section_code"] = "(" + section.strip() + ")"

    # Each detail header is paired positionally with a detail panel.
    detail_headers = detail_page.find_all("div", class_="detailHeader")
    detail_panels = detail_page.find_all("div", class_="detailPanel")
    if len(detail_headers) != len(detail_panels):
        raise ParseError("there should be equal detail headers and panels")

    # Header text -> name of the method that parses the matching panel.
    # "Cross Listings" and "Ad Hoc Meeting Times" are intentionally not
    # parsed; any other unlisted header is ignored as well. Methods are
    # looked up by name only when their header actually appears.
    panel_parsers = {
        "Details": "parse_labeled_table",
        "Availability": "parse_labeled_table",
        "Description": "extract_description",
        "Notes": "extract_notes",
        "Meeting Times": "parse_meeting_times",
        "Attributes": "parse_attributes",
    }
    for header_div, panel in zip(detail_headers, detail_panels):
        parser_name = panel_parsers.get(header_div.text.strip())
        if parser_name is not None:
            getattr(self, parser_name)(panel)

    course = self.ingestor.ingest_course()
    self.ingestor.ingest_section(course)
    # Clear the meetings list once the section has been ingested.
    self.ingestor["meetings"] = []
    return course_number