def section_from_header_link(self, header_link, course, term):
    """Build a section from a section header link plus its course and term.

    The link text is searched for the shape ``<index>-<type> (<solus id>)``.
    The three captured pieces become the section's ``index_in_course``, the
    abbreviation of its section type, and its ``solus_id``.

    Args:
        header_link: Parsed link element. Must provide ``get_text()`` and
            item access for its ``id`` attribute.
        course: Course the section belongs to.
        term: Term the section is offered in.

    Returns:
        The object produced by ``e_or_n(cc.Section, ...)`` with an extra
        ``click_action`` attribute set to the link's ``id``. The section
        itself is not saved here.

    Raises:
        ValueError: If the link text does not have the expected shape.
    """
    header_text = header_link.get_text()
    # Raw string: "\S" and "\(" are regex escapes, not string escapes.
    m = re.search(r'(\S+)-(\S+)\s+\((\S+)\)', header_text)
    if m is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            'Unrecognized section header text: "{0}"'.format(header_text))
    index_in_course, type_abbreviation, solus_id = m.groups()

    # Warn when no section type with this abbreviation is stored yet.
    known_types = cc.SectionType.objects.filter(abbreviation=type_abbreviation)
    if known_types.count() == 0:
        print('WARNING: No section type for abbreviation: "{0}"'.format(type_abbreviation))
    # NOTE(review): e_or_n presumably returns the existing match or a new
    # unsaved instance -- confirm against its definition.
    section_type = e_or_n(cc.SectionType, abbreviation=type_abbreviation)
    section_type.save()

    attrs = {
        'solus_id': solus_id,
        'index_in_course': index_in_course,
        'type': section_type,
        'course': course,
        'term': term,
    }
    # Make a base section from the supplied attributes
    section = e_or_n(cc.Section, **attrs)

    # Store the link action for visiting later in case of a deep scrape
    section.click_action = header_link['id']
    return section
def build_timeslots(self, all_days, start_time_str, end_time_str):
    """Build one timeslot per day code found in a string like 'MoTuWeSaSu'.

    Args:
        all_days: Concatenated day abbreviations, read two characters at a
            time. Empty, ``None`` or "TBA" means there are no timeslots.
        start_time_str: Start time in ``%I:%M%p`` form; empty or "TBA"
            leaves the start time as ``None``.
        end_time_str: End time in ``%I:%M%p`` form; empty or "TBA" leaves
            the end time as ``None``.

    Returns:
        ``None`` when no days are given, otherwise a list of saved
        timeslots. Days are consumed from the end of the string, so the
        list is in reverse of the written order.
    """
    if all_days == "TBA" or not all_days:
        return None

    def parse_clock(text):
        # A missing or "TBA" time stays unknown.
        if not text or text == "TBA":
            return None
        return datetime.strptime(text, "%I:%M%p")

    start_time = parse_clock(start_time_str)
    end_time = parse_clock(end_time_str)

    timeslots = []
    # Step backwards over the string in two-character chunks; with an odd
    # length the final (front-most) chunk is a single character.
    for stop in range(len(all_days), 0, -2):
        day_abbr = all_days[max(stop - 2, 0):stop]

        weekday = e_or_n(cc.DayOfWeek, abbreviation=day_abbr)
        weekday.save()

        timeslot = e_or_n(cc.Timeslot,
                          day_of_week=weekday,
                          start_time=start_time,
                          end_time=end_time)
        timeslot.save(was_scraped=True)
        timeslots.append(timeslot)

    return timeslots
def section_from_header_link(self, header_link, course, term):
    """Build a section from a section header link plus its course and term.

    The link text is searched for the shape ``<index>-<type> (<solus id>)``.
    The three captured pieces become the section's ``index_in_course``, the
    abbreviation of its section type, and its ``solus_id``.

    Args:
        header_link: Parsed link element. Must provide ``get_text()`` and
            item access for its ``id`` attribute.
        course: Course the section belongs to.
        term: Term the section is offered in.

    Returns:
        The object produced by ``e_or_n(cc.Section, ...)`` with an extra
        ``click_action`` attribute set to the link's ``id``. The section
        itself is not saved here.

    Raises:
        ValueError: If the link text does not have the expected shape.
    """
    header_text = header_link.get_text()
    # Raw string: "\S" and "\(" are regex escapes, not string escapes.
    m = re.search(r'(\S+)-(\S+)\s+\((\S+)\)', header_text)
    if m is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            'Unrecognized section header text: "{0}"'.format(header_text))
    index_in_course, type_abbreviation, solus_id = m.groups()

    # Warn when no section type with this abbreviation is stored yet.
    known_types = cc.SectionType.objects.filter(abbreviation=type_abbreviation)
    if known_types.count() == 0:
        print('WARNING: No section type for abbreviation: "{0}"'.format(type_abbreviation))
    # NOTE(review): e_or_n presumably returns the existing match or a new
    # unsaved instance -- confirm against its definition.
    section_type = e_or_n(cc.SectionType, abbreviation=type_abbreviation)
    section_type.save()

    attrs = {
        'solus_id': solus_id,
        'index_in_course': index_in_course,
        'type': section_type,
        'course': course,
        'term': term,
    }
    # Make a base section from the supplied attributes
    section = e_or_n(cc.Section, **attrs)

    # Store the link action for visiting later in case of a deep scrape
    section.click_action = header_link['id']
    return section
def terms_offered(self):
    """Return the non-outdated terms in which the current course is offered.

    Reads the ``<select id="DERIVED_SAA_CRS_TERM_ALT">`` element from
    ``self.soup`` and parses each option's text as ``"<year> <season>"``.
    A term is omitted as outdated when its year is before the current year,
    or when it is the "Winter" term of the current year. Options whose text
    cannot be parsed that way are skipped with a warning.

    Returns:
        A list of objects from ``e_or_n(cc.Term, ...)``, each carrying a
        ``dropdown_value`` attribute with the option's ``value``. The list
        is empty when the page has no term dropdown.
    """
    term_dropdown = self.soup.find("select", {'id': 'DERIVED_SAA_CRS_TERM_ALT'})
    if term_dropdown is None:
        # No dropdown on this page, so there is nothing to list.
        return []

    # The current date does not change per option; look it up once.
    now = datetime.now()

    terms = []
    for option in term_dropdown.find_all("option"):
        option_text = option.get_text()
        # Raw string so "\s" is a regex escape, not a string escape.
        m = re.search(r'^([^\s]+) (.*)$', option_text)
        year_number = None
        if m is not None:
            try:
                year_number = int(m.group(1))
            except ValueError:
                year_number = None
        if year_number is None:
            # e.g. a placeholder entry that is not "<year> <season>".
            print('WARNING: Skipping unrecognized term option: "{0}"'.format(option_text))
            continue

        dropdown_value = option['value']
        year = m.group(1)
        season = m.group(2)

        # Check if this term is old enough to be skipped
        if year_number < now.year or (year_number == now.year and season == "Winter"):
            print("--------Omitting outdated term: {season} - {year}".format(season=season, year=year))
            continue

        # Otherwise, make a real season and term
        season = e_or_n(cc.Season, name=season)
        season.save(was_scraped=True)

        term = e_or_n(cc.Term, year=year, season=season)
        # Store the dropdown value so we can request this term later in scraping
        term.dropdown_value = dropdown_value
        terms.append(term)

    return terms
def add_attribute_pair(self, attr, value, course):
    """Attach one scraped label/value pair to a course.

    Args:
        attr: Label of the attribute as scraped; must be a key of
            ``self.attribute_mappings``.
        value: Raw value for the attribute. When ``attr`` is also a key of
            ``self.attribute_class_mappings`` the value is replaced by a
            saved model instance obtained through ``e_or_n``.
        course: Object that receives the attribute via ``setattr``.

    Raises:
        Exception: If ``attr`` is not a key of ``self.attribute_mappings``.
    """
    # Reject unknown labels up front.
    if attr not in self.attribute_mappings:
        raise Exception('Encountered unexpected course attribute with label: "{0}"'.format(attr))

    # Name of the course attribute that will hold this value.
    attribute_name = self.attribute_mappings[attr]

    # Some labels are backed by a model class; otherwise the value is stored
    # as given.
    if attr in self.attribute_class_mappings:
        model_class = self.attribute_class_mappings[attr]
        value = e_or_n(model_class, name=value)
        # Saved here so a newly created instance is persisted.
        value.save()

    # Labels with a registered handler (e.g. requisites) get extra
    # processing; the handler is called with self passed explicitly.
    if attr in self.many_attribute_mappings:
        handler = self.many_attribute_mappings[attr]
        handler(self, value, course)

    setattr(course, attribute_name, value)
def add_attribute_pair(self, attr, value, course):
    """Attach one scraped label/value pair to a course.

    Args:
        attr: Label of the attribute as scraped; must be a key of
            ``self.attribute_mappings``.
        value: Raw value for the attribute. When ``attr`` is also a key of
            ``self.attribute_class_mappings`` the value is replaced by a
            saved model instance obtained through ``e_or_n``.
        course: Object that receives the attribute via ``setattr``.

    Raises:
        Exception: If ``attr`` is not a key of ``self.attribute_mappings``.
    """
    # Reject unknown labels up front.
    if attr not in self.attribute_mappings:
        raise Exception('Encountered unexpected course attribute with label: "{0}"'.format(attr))

    # Name of the course attribute that will hold this value.
    attribute_name = self.attribute_mappings[attr]

    # Some labels are backed by a model class; otherwise the value is stored
    # as given.
    if attr in self.attribute_class_mappings:
        model_class = self.attribute_class_mappings[attr]
        value = e_or_n(model_class, name=value)
        # Saved here so a newly created instance is persisted.
        value.save()

    # Labels with a registered handler (e.g. requisites) get extra
    # processing; the handler is called with self passed explicitly.
    if attr in self.many_attribute_mappings:
        handler = self.many_attribute_mappings[attr]
        handler(self, value, course)

    setattr(course, attribute_name, value)
def current_course(self, subject):
    """Build the course described by the page currently being scraped.

    Args:
        subject: Subject the course is filed under.

    Returns:
        The object produced by ``e_or_n(cc.Course, ...)`` after
        ``self.add_info_table_attributes`` has been applied to it.
    """
    # The page title supplies both the course title and its number.
    title, number = self.get_title()
    course = e_or_n(cc.Course, title=title, number=number, subject=subject)

    self.add_info_table_attributes(course)
    return course
def terms_offered(self):
    """Return the non-outdated terms in which the current course is offered.

    Reads the ``<select id="DERIVED_SAA_CRS_TERM_ALT">`` element from
    ``self.soup`` and parses each option's text as ``"<year> <season>"``.
    A term is omitted as outdated when its year is before the current year,
    or when it is the "Winter" term of the current year. Options whose text
    cannot be parsed that way are skipped with a warning.

    Returns:
        A list of objects from ``e_or_n(cc.Term, ...)``, each carrying a
        ``dropdown_value`` attribute with the option's ``value``. The list
        is empty when the page has no term dropdown.
    """
    term_dropdown = self.soup.find("select", {'id': 'DERIVED_SAA_CRS_TERM_ALT'})
    if term_dropdown is None:
        # No dropdown on this page, so there is nothing to list.
        return []

    # The current date does not change per option; look it up once.
    now = datetime.now()

    terms = []
    for option in term_dropdown.find_all("option"):
        option_text = option.get_text()
        # Raw string so "\s" is a regex escape, not a string escape.
        m = re.search(r'^([^\s]+) (.*)$', option_text)
        year_number = None
        if m is not None:
            try:
                year_number = int(m.group(1))
            except ValueError:
                year_number = None
        if year_number is None:
            # e.g. a placeholder entry that is not "<year> <season>".
            print('WARNING: Skipping unrecognized term option: "{0}"'.format(option_text))
            continue

        dropdown_value = option['value']
        year = m.group(1)
        season = m.group(2)

        # Check if this term is old enough to be skipped
        if year_number < now.year or (year_number == now.year and season == "Winter"):
            print("--------Omitting outdated term: {season} - {year}".format(season=season, year=year))
            continue

        # Otherwise, make a real season and term
        season = e_or_n(cc.Season, name=season)
        season.save(was_scraped=True)

        term = e_or_n(cc.Term, year=year, season=season)
        # Store the dropdown value so we can request this term later in scraping
        term.dropdown_value = dropdown_value
        terms.append(term)

    return terms
def build_timeslots(self, all_days, start_time_str, end_time_str):
    """Build one timeslot per day code found in a string like 'MoTuWeSaSu'.

    Args:
        all_days: Concatenated day abbreviations, read two characters at a
            time. Empty, ``None`` or "TBA" means there are no timeslots.
        start_time_str: Start time in ``%I:%M%p`` form; empty or "TBA"
            leaves the start time as ``None``.
        end_time_str: End time in ``%I:%M%p`` form; empty or "TBA" leaves
            the end time as ``None``.

    Returns:
        ``None`` when no days are given, otherwise a list of saved
        timeslots. Days are consumed from the end of the string, so the
        list is in reverse of the written order.
    """
    if all_days == "TBA" or not all_days:
        return None

    def parse_clock(text):
        # A missing or "TBA" time stays unknown.
        if not text or text == "TBA":
            return None
        return datetime.strptime(text, "%I:%M%p")

    start_time = parse_clock(start_time_str)
    end_time = parse_clock(end_time_str)

    timeslots = []
    # Step backwards over the string in two-character chunks; with an odd
    # length the final (front-most) chunk is a single character.
    for stop in range(len(all_days), 0, -2):
        day_abbr = all_days[max(stop - 2, 0):stop]

        weekday = e_or_n(cc.DayOfWeek, abbreviation=day_abbr)
        weekday.save()

        timeslot = e_or_n(cc.Timeslot,
                          day_of_week=weekday,
                          start_time=start_time,
                          end_time=end_time)
        timeslot.save(was_scraped=True)
        timeslots.append(timeslot)

    return timeslots
def build_section(self, header_link, component_table, course, term):
    """Build and save a section together with its components.

    The section comes from ``self.section_from_header_link`` and is saved
    first. Every row of ``component_table`` after the header row then yields
    one component per timeslot (or a single component without a timeslot
    when ``self.build_timeslots`` returns ``None``).

    Within a row the cleaned ``<span>`` texts are read by position:
    0 days, 1 start time, 2 end time, 3 room, 4 instructors, 5 date range.

    Args:
        header_link: Link element describing the section.
        component_table: Table element whose first ``<tr>`` is a header.
        course: Course the section belongs to.
        term: Term the section is offered in.

    Returns:
        The saved section.
    """
    section = self.section_from_header_link(header_link, course, term)
    section.save()

    rows = component_table.find_all('tr', {})
    # Drop the header row.
    del rows[0]

    for row in rows:
        cells = [self.clean_HTML(span.get_text()) for span in row.find_all('span')]
        room = cells[3]

        # start/end dates, instructors and timeslots, in that order
        start_date, end_date = self.date_range(cells[5])
        instructors = self.instructors_from_string(cells[4])
        timeslots = self.build_timeslots(cells[0], cells[1], cells[2])

        if timeslots is None:
            # No timeslot information: still create one component, with a
            # TBA (None) timeslot.
            timeslots = [None]

        # One section component per timeslot.
        for timeslot in timeslots:
            component = e_or_n(cc.SectionComponent,
                               section=section,
                               room=room,
                               start_date=start_date,
                               end_date=end_date,
                               timeslot=timeslot)
            component.instructors = instructors
            component.save(was_scraped=True)

    return section
def add_requisites(self, enrollment_reqs, course):
    """Save a requisite for every course code found in a requirements text.

    A course code is three or four capital letters, optional whitespace,
    then three digits optionally followed by ``A`` or ``B``.

    Args:
        enrollment_reqs: Text to search for course codes.
        course: Course the requisites are recorded for.
    """
    course_pattern = re.compile(r'(?P<abbr>[A-Z]{3,4})\s*(?P<num>\d{3}[AB]?)')
    for found in course_pattern.finditer(enrollment_reqs):
        # The match offsets are stored with the subject and number.
        req = e_or_n(cc.Requisite,
                     subject_abbr=found.group('abbr'),
                     course_number=found.group('num'),
                     left_index=found.start(),
                     right_index=found.end(),
                     for_course=course)
        req.save()
def build_section(self, header_link, component_table, course, term):
    """Build and save a section together with its components.

    The section comes from ``self.section_from_header_link`` and is saved
    first. Every row of ``component_table`` after the header row then yields
    one component per timeslot (or a single component without a timeslot
    when ``self.build_timeslots`` returns ``None``).

    Within a row the cleaned ``<span>`` texts are read by position:
    0 days, 1 start time, 2 end time, 3 room, 4 instructors, 5 date range.

    Args:
        header_link: Link element describing the section.
        component_table: Table element whose first ``<tr>`` is a header.
        course: Course the section belongs to.
        term: Term the section is offered in.

    Returns:
        The saved section.
    """
    section = self.section_from_header_link(header_link, course, term)
    section.save()

    rows = component_table.find_all('tr', {})
    # Drop the header row.
    del rows[0]

    for row in rows:
        cells = [self.clean_HTML(span.get_text()) for span in row.find_all('span')]
        room = cells[3]

        # start/end dates, instructors and timeslots, in that order
        start_date, end_date = self.date_range(cells[5])
        instructors = self.instructors_from_string(cells[4])
        timeslots = self.build_timeslots(cells[0], cells[1], cells[2])

        if timeslots is None:
            # No timeslot information: still create one component, with a
            # TBA (None) timeslot.
            timeslots = [None]

        # One section component per timeslot.
        for timeslot in timeslots:
            component = e_or_n(cc.SectionComponent,
                               section=section,
                               room=room,
                               start_date=start_date,
                               end_date=end_date,
                               timeslot=timeslot)
            component.instructors = instructors
            component.save(was_scraped=True)

    return section
def subject_from_dropdown(self, subject_index):
    """Return the subject behind one dropdown link of the current page.

    The link is looked up in ``self.soup`` by the name that
    ``self._subject_link_name(subject_index)`` produces. Its stripped text
    is split as ``"<abbreviation> - <title>"``.

    Args:
        subject_index: Index handed to ``self._subject_link_name``.

    Returns:
        ``None`` when no such link exists, otherwise the object from
        ``e_or_n(cc.Subject, ...)`` with ``click_action`` set to the link
        name so it can be clicked later.
    """
    link_name = self._subject_link_name(subject_index)

    dropdown_link = self.soup.find("a", {"name": link_name})
    if not dropdown_link:
        # Doesn't exist
        return None

    # Pull the abbreviation and the title out of the link text.
    link_text = dropdown_link.get_text().strip()
    m = re.search("^([^-]*) - (.*)$", link_text)
    subject_abbr, subject_title = m.group(1), m.group(2)

    subject = e_or_n(cc.Subject, title=subject_title, abbreviation=subject_abbr)
    subject.click_action = link_name
    return subject
def instructors_from_string(self, instructors_str):
    """Return the instructors named in a comma separated string.

    The text is split on every comma and the fragments are read in pairs
    ("last name", "other names"), giving the name ``"last, other"``. A lone
    trailing fragment (a name without a comma, or an odd number of
    fragments) is used as a name on its own instead of raising
    ``IndexError``; an empty trailing fragment is ignored.

    Args:
        instructors_str: Raw instructor text. Empty, ``None``, "TBA" and
            "Staff" all mean there are no named instructors.

    Returns:
        A list of saved instructors, possibly empty.
    """
    instructors = []
    if not instructors_str or instructors_str in ("TBA", "Staff"):
        return instructors

    # Collapse runs of whitespace, then split on every comma (one between
    # profs, one after last names).
    collapsed = re.sub(r'\s+', ' ', instructors_str)
    fragments = [fragment.strip() for fragment in collapsed.split(",")]

    # Associate every pair of fragments as a full name.
    for i in range(0, len(fragments), 2):
        # Slicing never overruns the list, unlike indexing fragments[i + 1].
        name_parts = fragments[i:i + 2]
        full_name = u", ".join(name_parts)
        if not full_name:
            # Only a lone empty fragment joins to an empty name.
            continue
        instructor = e_or_n(cc.Instructor, name=full_name)
        instructor.save(was_scraped=True)
        instructors.append(instructor)

    return instructors
def instructors_from_string(self, instructors_str):
    """Return the instructors named in a comma separated string.

    The text is split on every comma and the fragments are read in pairs
    ("last name", "other names"), giving the name ``"last, other"``. A lone
    trailing fragment (a name without a comma, or an odd number of
    fragments) is used as a name on its own instead of raising
    ``IndexError``; an empty trailing fragment is ignored.

    Args:
        instructors_str: Raw instructor text. Empty, ``None``, "TBA" and
            "Staff" all mean there are no named instructors.

    Returns:
        A list of saved instructors, possibly empty.
    """
    instructors = []
    if not instructors_str or instructors_str in ("TBA", "Staff"):
        return instructors

    # Collapse runs of whitespace, then split on every comma (one between
    # profs, one after last names).
    collapsed = re.sub(r'\s+', ' ', instructors_str)
    fragments = [fragment.strip() for fragment in collapsed.split(",")]

    # Associate every pair of fragments as a full name.
    for i in range(0, len(fragments), 2):
        # Slicing never overruns the list, unlike indexing fragments[i + 1].
        name_parts = fragments[i:i + 2]
        full_name = u", ".join(name_parts)
        if not full_name:
            # Only a lone empty fragment joins to an empty name.
            continue
        instructor = e_or_n(cc.Instructor, name=full_name)
        instructor.save(was_scraped=True)
        instructors.append(instructor)

    return instructors