# Third-party imports used throughout the scrapers below.
import re

import requests
from bs4 import BeautifulSoup, NavigableString
from sqlalchemy import func

# `session`, `META`, `clean`, and the model classes (University, Department,
# Course) come from this project's own modules (import paths not shown here).


def load_acm_file(department, data_file):
    while True:
        title = data_file.readline().rstrip('\n')
        description_raw = data_file.readline().rstrip('\n')
        if not description_raw:
            break

        # Clean the description
        description = clean(description_raw)
        if description is None:
            continue

        # Generate the appropriate course object.
        new_course = Course(
            number=0,  # blank out course numbers
            title=title,
            description=description,
            description_raw=description_raw)
        department.courses.append(new_course)

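# A minimal usage sketch for load_acm_file. The data file alternates title
# and description lines, per the reader above. The file path and the way the
# Department row is looked up are illustrative assumptions, not part of this
# module.
#
#     acm_department = session.query(Department) \
#         .filter(Department.abbreviation == "ACM") \
#         .first()
#     with open("data/acm.txt") as data_file:    # hypothetical path
#         load_acm_file(acm_department, data_file)
#     session.commit()
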
def scrape(args):
    """ Scrape the available data from the AU catalog into a database. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping American University data.")

    # Constant values.
    catalog_index_url = "http://catalog.american.edu/preview_course_incoming.php?prefix=%s&catoid=3"
    course_url = "http://catalog.american.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.
    prereq_re = re.compile("([A-Z]{3,4})(-)(\d{3})")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
                for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["csc"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                                    onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            cnum = full_title[0].split("-")[1]
            title = " ".join(full_title[1:])

            # Identify coid to get description.
            onclick = course["onclick"]
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr.br

            # Remove garbage
            [div.extract() for div in content.find_all("div")]

            # Grab extra data
            extra = ""
            em = content.find("em")
            while em:
                if isinstance(em, NavigableString):
                    extra += em
                else:
                    extra += em.text
                rem = em
                em = em.next_sibling
                rem.extract()
            # str.replace/strip return new strings, so the result must be
            # reassigned (the original discarded it).
            extra = extra.replace("\n", "").replace("\xa0", "").strip(" ")

            # Clean up the description
            description_raw = content.text.replace("\n", "") \
                .replace("\xa0", "").strip(" ")
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites
            prereq_index = extra.find("requis")
            prereq_list = None
            if prereq_index > -1:
                matches = prereq_re.findall(extra[prereq_index:])
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],   # department
                        "n": match[-1]   # number
                    } for match in matches]

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            if d.startswith("OR ") or d.startswith("ND "):
                d = d[3:]

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                .join(Department) \
                .filter(Department.university == university) \
                .filter(func.lower(Department.abbreviation) == d.lower()) \
                .filter(Course.number == int(n)) \
                .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")

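# The prerequisite-resolution loop above is repeated nearly verbatim in the
# GMU, Utah, KSU, and UTK scrapers below. A minimal sketch of how it could be
# factored into one shared helper; the name `resolve_prerequisites` and its
# signature are assumptions, not part of the existing codebase.
def resolve_prerequisites(session, university, prereq_dict, log):
    """Hypothetical shared helper: attach prerequisite Course rows for each
    {"d": dept-abbreviation, "n": number} match collected while scraping."""
    for course, prereq_list in prereq_dict.items():
        department_stack = []
        for prereq in prereq_list:
            d, n = prereq["d"], prereq["n"]
            # Referential prereqs ("and", "or", ",") reuse the last
            # department seen; skip if none has been seen yet.
            if d in ("and", "or", ","):
                if not department_stack:
                    continue
                d = department_stack[-1]
            department_stack.append(d)
            prereq_course = session.query(Course) \
                .join(Department) \
                .filter(Department.university == university) \
                .filter(func.lower(Department.abbreviation) == d.lower()) \
                .filter(Course.number == int(n)) \
                .first()
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))
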
def scrape(args):
    """ Scrape the available syllabi from the GMU CS page into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping GMU data.")

    # Constant values.
    catalog_index_url = "http://catalog.gmu.edu/preview_course_incoming.php?cattype=combined&prefix=%s"
    course_url = "http://catalog.gmu.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.
    prereq_re = re.compile("([A-Za-z,]{2,4})(\s|\\\\xa0)(\d{3})")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
                for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs", "ait"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                                    onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr.text

            # Clean up the description.
            description = content
            try:
                description = description[:description.index("Hours of Lecture")]
            except ValueError:
                pass

            # Identify prerequisites
            prereq_index = description.find("Prerequisite(s)")
            prereq_list = None
            if prereq_index > -1:
                # Grab the substring of prereqs and find matches.
                notes_index = description.find("Notes")
                if notes_index == -1:
                    # find() returns -1 when "Notes" is absent, which would
                    # otherwise clip the final character off the slice.
                    notes_index = len(description)
                prereq_string = description[prereq_index:notes_index]
                description = description[:prereq_index]
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],  # department
                        "n": match[2]   # number
                    } for match in matches]

            # Clean the description string
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(
                number=cnum,
                title=title,
                description=description,
                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                .join(Department) \
                .filter(Department.university == university) \
                .filter(func.lower(Department.abbreviation) == d.lower()) \
                .filter(Course.number == int(n)) \
                .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")

def scrape(args):
    """ Scrape the available syllabi from the Utah catalog into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping Utah data.")

    # Constant values.
    catalog_index_url = "http://catalog.utah.edu/preview_course_incoming.php?prefix=%s&catoid=6"
    course_url = "http://catalog.utah.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.
    prereq_re = re.compile(
        "(([A-Z]{2,5})|([A-Z]{2}(\s|\\\\xa0)[A-Z]{2})|(H(\s|\\\\xa0)[A-Z]{3}))"
        "(\s|\\\\xa0)(\d{4})")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
                for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                                    onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Identify all the course heading data.
            strongs = content.find_all("strong")
            headings = {}
            tag = content
            for strong in strongs:
                tag = strong.next_sibling
                text = ""
                while True:
                    # Guard against running off the end of the siblings when
                    # a heading is not terminated by a <br>.
                    if tag is None or tag.name == "br":
                        break
                    if isinstance(tag, NavigableString):
                        text += tag
                    else:
                        text += tag.text
                    tag = tag.next_sibling
                headings[strong.text] = text

            # Update the cursor to post-heading data.
            content = tag

            # Remove the footer links
            [a.extract() for a in tag.find_all("a")]

            # Clean up the description
            description_raw = content.text.replace('\n', '')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites
            prereq_string = headings.get("Enrollment Requirement:")
            prereq_list = None
            if prereq_string is not None:
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],   # department
                        "n": match[-1]   # number
                    } for match in matches]

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            if d.startswith("OR ") or d.startswith("ND "):
                d = d[3:]

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                .join(Department) \
                .filter(Department.university == university) \
                .filter(func.lower(Department.abbreviation) == d.lower()) \
                .filter(Course.number == int(n)) \
                .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")

def scrape(args):
    """ Scrape the available syllabi from the PDX CS page into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping PDX CS data.")

    # Construct a soup of the index.
    course_index_url = "http://www.pdx.edu/computer-science/courses"
    course_index = requests.get(course_index_url)
    soup = BeautifulSoup(course_index.text)

    # Identify the list of courses.
    course_list = soup.find("h3", text="Course List") \
        .find_next_sibling("ul") \
        .find_all("a")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    for course in course_list:
        log.debug(course.text)
        full_title = re.compile("\s+").split(course.text)
        prefix = full_title[0]
        cnum = re.sub('[/]', '-', full_title[1])
        title = ' '.join(full_title[2:])

        if args.cs and prefix.lower() not in ["cs"]:
            continue

        # Fetch the href outside the try block so the error path below can
        # log it even when the request itself is what failed.
        course_url = course.get('href')
        try:
            course_soup = BeautifulSoup(requests.get(course_url).text)
        except Exception:
            log.warn("Unable to parse course page.")
            log.warn(course_url)
            continue

        # Find the course description based on its neighbour
        cdesc_re = re.compile(".*Course Description.*")
        cdesc = course_soup.find("table").find(text=cdesc_re)
        if not cdesc.next_sibling:
            cdesc = cdesc.find_parent("td")

        # If there's no course description found, well, forget it
        try:
            description = cdesc.find_next_sibling("td").text
        except AttributeError:  # no sibling cell
            log.warn("No course description available.")
            log.warn(course_url)
            continue

        # Clean the description string.
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # Find the course prerequisite list based on its neighbour
        prereq_re = re.compile(".*Prerequi?sites.*")
        prereq = course_soup.find("table").find(text=prereq_re)
        if not prereq.next_sibling:
            prereq = prereq.find_parent("td")

        # If there's no prereq list found, leave it as None
        try:
            prereq = prereq.find_next_sibling("td").text
        except AttributeError:
            prereq = None

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info("Completed scraping.")

def scrape(args):
    """ Scrape the available syllabi from the SC CS page into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping SC CS data.")

    # Generate a BeautifulSoup object.
    catalog_index_url = "http://bulletin.sc.edu/content.php?catoid=36&navoid=4242&filter[27]=CSCE"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(name="a",
                                onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid.
    catoid_re = re.compile("(?<=catoid=)\d+")
    coid_re = re.compile("(?<=coid=)\d+")

    # Piecewise course page url.
    course_url = "http://bulletin.sc.edu/preview_course.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    # Identify relevant information for each course.
    prereqs = {}
    for course in course_list:

        # Generate metadata
        log.debug(course.text)
        full_title = re.compile("\s+").split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.h1.next_sibling.next_sibling.text

        # Clean up the description.
        def strip_substring(body, substring):
            try:
                return body[:body.index(substring)]
            except ValueError:  # substring not present
                return body

        # Clean the content string
        content = strip_substring(content, "Print-Friendly Page Close Window")

        # Clean the description string
        description_raw = content
        description_raw = strip_substring(description_raw, "Prereq")
        description_raw = strip_substring(description_raw, "Coreq")
        description_raw = strip_substring(description_raw, "Note:")
        description_raw = strip_substring(description_raw, "Cross-listed")
        description = clean(description_raw)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info("Completed scraping.")

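# Quick illustration of the lookbehind regexes used above (and in the LSU,
# RPI, and KSU scrapers) to pull the numeric ids out of a course link. The
# href value here is made up for the example.
#
#     import re
#     href = "preview_course.php?catoid=36&coid=90210"
#     catoid_re = re.compile(r"(?<=catoid=)\d+")
#     coid_re = re.compile(r"(?<=coid=)\d+")
#     catoid_re.search(href).group(0)  # -> "36"
#     coid_re.search(href).group(0)    # -> "90210"
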
def scrape(args):
    """ Scrape the available syllabi from the LSU CS page into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping LSU CS data.")

    # Generate a BeautifulSoup object.
    catalog_index_url = "http://catalog.lsu.edu/content.php?filter[27]=CSC&cur_cat_oid=6&navoid=538"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(name="a",
                                onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid.
    catoid_re = re.compile("(?<=catoid=)\d+")
    coid_re = re.compile("(?<=coid=)\d+")

    # Piecewise course page url.
    course_url = "http://catalog.lsu.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    # Identify relevant information for each course.
    prereqs = {}
    for course in course_list:

        # Generate metadata
        log.debug(course.text)
        full_title = re.compile("\s+").split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[2:-1])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.find(class_="block_content").hr

        # Remove the metadata.
        for em in content.find_all("em"):
            em.decompose()

        # Clean the description string
        description_raw = content.text
        description = clean(description_raw)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info("Completed scraping.")

def scrape(args):
    """ Scrape the available syllabi from the UTK catalog into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping UTK data.")

    # Constant values.
    catalog_index_url = "http://catalog.utk.edu/preview_course_incoming.php?prefix=%s&catoid=18"
    course_url = "http://catalog.utk.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.
    prereq_re = re.compile("(\s|\\\\xa0)(\d{3})")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
                for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cosc"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                                    onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Clean out garbage
            [div.extract() for div in content.find_all("div")]
            extra = ' '.join(
                [em.extract().text for em in content.find_all("em")])
            # strip() returns a new string, so reassign it (the original
            # discarded the result).
            extra = extra.strip(' ')

            # Clean up the description
            description_raw = content.text.replace('\n', '').strip(' ')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites
            prereq_index = extra.find("requisite")
            prereq_list = None
            if prereq_index > -1:
                prereq_string = extra[prereq_index:]
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    # Naive assumption that every course number is within
                    # the same department because the department titles are
                    # written out and not matchable.
                    prereq_list = [{
                        "d": prefix.lower(),  # department
                        "n": match[-1]        # number
                    } for match in matches]

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                .join(Department) \
                .filter(Department.university == university) \
                .filter(func.lower(Department.abbreviation) == d.lower()) \
                .filter(Course.number == int(n)) \
                .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")

def scrape(args):
    """ Scrape the available syllabi from the RPI CS page into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping RPI CS data.")

    # Generate a BeautifulSoup object.
    catalog_index_url = "http://catalog.rpi.edu/content.php?filter[27]=CSCI&cur_cat_oid=13&navoid=313"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(name="a",
                                onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid.
    catoid_re = re.compile("(?<=catoid=)\d+")
    coid_re = re.compile("(?<=coid=)\d+")

    # Piecewise course page url.
    course_url = "http://catalog.rpi.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    # Identify relevant information for each course.
    prereqs = {}
    for course in course_list:

        # Generate metadata
        log.debug(course.text)
        full_title = re.compile("\s+").split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.find(class_="block_content").hr.text

        # Clean up the description, truncating at whichever trailing markers
        # are present. Each lookup can fail independently, so each gets its
        # own try block (a single block would skip the second marker
        # whenever the first was missing).
        description = content
        try:
            description = description[:description.index("Credit Hours")]
        except ValueError:
            pass
        try:
            description = description[:description.index("When Offered")]
        except ValueError:
            pass

        # Identify prerequisites
        # TODO: Match these up with their database entries.
        prereq_index = description.find("Prerequisit")
        if prereq_index > -1:
            prereq_string = description[prereq_index:]
            description = description[:prereq_index]
            prereq_re = re.compile("\w{2,4}\s\d{3}")
            matches = re.findall(prereq_re, prereq_string)
            if len(matches) > 0:
                prereqs["%s %s" % (prefix, cnum)] = matches

        # Clean the description string
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info("Completed scraping.")

def scrape(args):
    """ Scrape the available syllabi from the Stanford CS page into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping Stanford CS data.")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    # Static connection information
    catalog_url = "https://explorecourses.stanford.edu/search?q=CS&view=catalog&filter-departmentcode-CS=on&filter-term-Spring=on&filter-coursestatus-Active=on&page="
    catalog_page = 0
    catalog_page_limit = 8
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    }

    # Page through the search results. There are currently only 8 result
    # pages, so stop after that many rather than hunting for a "next >" link.
    while True:
        if catalog_page == catalog_page_limit:
            break

        # Generate a BeautifulSoup object.
        response = requests.get(catalog_url + str(catalog_page),
                                headers=headers)
        soup = BeautifulSoup(response.text)

        # Identify the course list.
        course_list = soup.find_all(class_="searchResult")

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata
            title = course.find(class_="courseTitle").text
            identifier = re.compile("\s+").split(
                course.find(class_="courseNumber").text)
            prefix = identifier[0]
            cnum = identifier[1][:-1]  # drop the trailing ":"
            description = course.find(class_="courseDescription").text
            log.debug(identifier)

            # Identify prerequisites or corequisites.
            # TODO: Match these up with their database entries.
            prereq_index = description.find("Prerequisite")
            if prereq_index > -1:
                prereq_string = description[prereq_index:]
                description = description[:prereq_index]

            # Clean the description string
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            departments[prefix.lower()].courses.append(Course(
                number=cnum,
                title=title,
                description_raw=description_raw,
                description=description))

        # Go to the next page.
        catalog_page += 1

    log.info("Completed scraping.")

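# The hard-coded eight-page limit above will silently drop courses if the
# catalog grows. A sketch of a paging loop that instead stops when a page
# comes back empty; it reuses catalog_url and headers from scrape() above,
# and it assumes (unverified) that an out-of-range page renders zero
# "searchResult" entries.
#
#     catalog_page = 0
#     while True:
#         response = requests.get(catalog_url + str(catalog_page),
#                                 headers=headers)
#         soup = BeautifulSoup(response.text)
#         course_list = soup.find_all(class_="searchResult")
#         if not course_list:  # assumed end-of-results marker
#             break
#         # ... process course_list exactly as above ...
#         catalog_page += 1
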
def scrape(args):
    """ Scrape the available syllabi from the KSU CS page into a local
    directory. """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping KSU data.")

    # Constant values.
    catalog_index_url = "http://catalog.k-state.edu/content.php?filter[27]=%s&cur_cat_oid=13&navoid=1425"
    course_url = "http://catalog.k-state.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Scraper regexes.
    catoid_re = re.compile("(?<=catoid=)\d+")
    coid_re = re.compile("(?<=coid=)\d+")
    prereq_re = re.compile("([A-Za-z,]{2,5})(\s|\\\\xa0)(\d{3})")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University) \
        .filter(University.name == university) \
        .first()
    departments = {department.abbreviation.lower(): department
                   for department in session.query(Department)
                       .filter(Department.university == university)
                       .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in departments.keys():
        if args.cs and prefix not in ["cis"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                                    onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            #prefix = full_title[0]
            cnum = full_title[1]
            title = ' '.join(full_title[3:])
            title = title.replace("'", "")

            # Identify coid to get description.
            href = course['href']
            catoid = catoid_re.search(href).group(0)
            coid = coid_re.search(href).group(0)

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catoid, coid))
            course_soup = BeautifulSoup(course_page.text)

            # Identify the components of the description and its metadata.
            result_set = course_soup.find(class_="block_content") \
                .table \
                .find_next_sibling("p") \
                .h1 \
                .find_next_siblings()

            # Join them together as text.
            content = ' '.join([r.text for r in result_set[1:]])

            # Clean up the description.
            def strip_substring(body, substring):
                try:
                    return body[:body.index(substring)]
                except ValueError:  # substring not present
                    return body

            description = content
            description = strip_substring(description, "Note")
            description = strip_substring(description, "Requisites")
            description = strip_substring(description, "When Offered")
            description = strip_substring(description, "UGE course")

            # Identify prerequisites.
            prereq_index = content.find("Requisites")
            prereq_list = None
            if prereq_index > -1:
                # Grab the substring of prereqs and find matches.
                prereq_string = content[prereq_index:]
                prereq_string = strip_substring(prereq_string, "Note")
                prereq_string = strip_substring(prereq_string, "When Offered")
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],  # department
                        "n": match[2]   # number
                    } for match in matches]

            # Clean the description string
            description_raw = description
            description = clean(description_raw)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                .join(Department) \
                .filter(Department.university == university) \
                .filter(func.lower(Department.abbreviation) == d.lower()) \
                .filter(Course.number == int(n)) \
                .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")