def import_results(args):
    """
    Read topic data generated from the learn module and store it in the
    database.

    args is expected to provide: alpha, beta, iterations, topic_file
    (CSV of topic id + words), and course_file (CSV of course id +
    "topic:weight" pairs).
    """
    from trajectory.models import Course, Topic, CourseTopicAssociation
    from trajectory.models import ResultSet
    from trajectory.models.meta import session
    from trajectory import config as TRJ
    import logging, csv
    log = logging.getLogger("root")

    log.info("Begin topic import.")

    # Create a new result set to group everything imported in this run.
    result_set = ResultSet(
        alpha=args.alpha,
        beta=args.beta,
        iterations=args.iterations
    )
    session.add(result_set)
    session.commit()

    # Add in new topic definitions.
    with open(args.topic_file, "r") as topic_file:
        topic_reader = csv.reader(topic_file, delimiter=",")
        next(topic_reader, None)  # skip header
        topic_count = 0
        for topic in topic_reader:
            topic_count += 1
            session.add(Topic(
                id=topic[0],
                result_set=result_set,
                words=', '.join(topic[1:])
            ))
        result_set.num_topics = topic_count

    # Add the topics to their courses.
    # NOTE: the original also ran session.query(Course).all() here and
    # discarded the result; that dead query is removed.
    course_query = session.query(Course)

    with open(args.course_file, "r") as course_file:
        course_reader = csv.reader(course_file, delimiter=",")
        next(course_reader, None)  # skip header

        # {course: [[topic_id, weight], ...]} — skip unknown course ids
        # and topics at or below the minimum weight threshold.  The
        # original dict comprehension looked each course up twice per
        # row; this performs a single query per row.
        topics_to_add = {}
        for row in course_reader:
            course = course_query.get(row[1])
            if course is None:
                continue
            topics_to_add[course] = [
                topic.split(':') for topic in row[2:]
                if float(topic.split(':')[1]) > TRJ.TOPIC_MIN_WEIGHT
            ]

        for course, topic_list in topics_to_add.items():
            for (topicid, proportion) in topic_list:
                association = CourseTopicAssociation(proportion=proportion)
                association.topic_id = topicid
                association.result_set_id = result_set.id
                course.topics.append(association)

    log.info("Topic import complete.")
def scrape(args):
    """
    Manipulate the ACM textual data and store it in the database.

    Reads the ACM exemplar-course and knowledge-area data files (paths
    from trajectory config) and attaches the parsed courses to the
    "EC" and "KA" pseudo-departments of the configured university.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping ACM data.")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()

    def load_acm_file(department, data_file):
        # Parse alternating title/description line pairs until an empty
        # description line (blank line or EOF) terminates the stream.
        while True:
            title = data_file.readline().rstrip('\n')
            description_raw = data_file.readline().rstrip('\n')
            if not description_raw:
                break

            # Clean the description; skip entries that clean() rejects.
            description = clean(description_raw)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(
                    number=0, # blank out course numbers
                    title=title,
                    description=description,
                    description_raw=description_raw)
            department.courses.append(new_course)

    # Load exemplar courses into the "EC" department.
    exemplar_courses = session.query(Department)\
            .filter(Department.university==university)\
            .filter(Department.abbreviation=="EC")\
            .first()
    # Use context managers so the files are closed even if parsing
    # raises (the original open()/close() pairs leaked on error).
    with open(TRJ.ACM_EXEMPLARS) as data_file:
        load_acm_file(exemplar_courses, data_file)

    # Load knowledge areas into the "KA" department.
    knowledge_areas = session.query(Department)\
            .filter(Department.university==university)\
            .filter(Department.abbreviation=="KA")\
            .first()
    with open(TRJ.ACM_KA) as data_file:
        load_acm_file(knowledge_areas, data_file)

    log.info("Completed scraping.")
def add_tree(G, tree, parent=None): cid = tree[0] # unpack information prereqs = tree[1] # unpack information course = session.query(Course).get(cid) # Insert all known data, including department abbreviation. node_data = row2dict(course) node_data['dept'] = course.department.abbreviation # Identify the primary course in the graph (the requested). if str(cid) == str(course_id): node_data['prime'] = True else: node_data['prime'] = False # If the course has already been added, generate a unique ID for it # based on its parent, and add it anyway. But don't recurse into # its list of prereqs. seen = False if cid in G.nodes(): cid = str(parent) + "-" + str(cid) seen = True # Add course and an edge from its parent, if relevant. G.add_node(cid, node_data) if parent is not None: G.add_edge(parent, cid) # Recurse through the prerequisite tree and add in subtrees. if not seen: for prereq in prereqs: add_tree(G, prereq, cid)
def university_prerequisite_statistics(abbreviation, result_set):
    """
    Compute the mean and standard deviation of prerequisite topic
    distances over all courses of the university identified by
    abbreviation, under the given result_set.

    Returns a (mean, stdv) tuple.  If no course has prerequisites the
    statistics are computed over an empty sequence (numpy yields nan).
    """
    uni_courses = session.query(Course)\
            .join(Department)\
            .join(University)\
            .filter(University.abbreviation==abbreviation)\
            .all()
    prereq_distances = [prerequisite_distances(course, result_set)
            for course in uni_courses]
    # Strip courses with no prerequisites.
    prereq_distances = [p for p in prereq_distances if p]

    # Flatten once — the original materialized the chained iterable
    # twice, once per statistic.
    distances = list(chain.from_iterable(prereq_distances))
    mean = numpy.mean(distances)
    stdv = numpy.std(distances)
    return (mean, stdv)
def topic_list(item=None, result_set=None):
    """
    Retrieve the ordered list of all topics represented by some item. If no
    item is requested, return the global list of topics. Optional filter by
    a result set.

    item can be a University, Department, or Course.
    """
    from trajectory.models import University, Department, Course, Topic
    from trajectory.models import CourseTopicAssociation
    from trajectory.models.meta import session

    # Generate the initial query. Note that if a result_set instance is
    # passed in, it will be used to filter the global topics.
    topics_query = session.query(Topic).order_by(Topic.id)
    if result_set is not None:
        topics_query = topics_query.filter(Topic.result_set_id==result_set.id)

    # If there was no item specifically requested, just return the global
    # topic set.
    if item is None:
        return topics_query.all()

    # Start constructing the per-item topic list.
    item_topics = topics_query.join(CourseTopicAssociation).join(Course)

    # Add any additional filtration for course/department/university.
    # isinstance() replaces the original type()== comparisons (idiomatic,
    # and tolerant of ORM subclassing).
    if isinstance(item, Course):
        item_topics = item_topics \
                .filter(Course.id==item.id) \
                .all()
    elif isinstance(item, Department):
        item_topics = item_topics \
                .join(Department) \
                .filter(Department.id==item.id) \
                .all()
    elif isinstance(item, University):
        item_topics = item_topics \
                .join(Department) \
                .join(University) \
                .filter(University.id==item.id) \
                .all()
    else:
        raise RuntimeError("Unknown item type requested.")

    return item_topics
def ground_truth_knowledge_areas(course):
    """
    Look up the set of ground truth manually annotated knowledge areas
    for a course. If none are found, simply return an empty set.

    Returns a list of ACM knowledge-area Course objects, or an empty
    set when the course is None or has no annotation.
    """
    from trajectory.models import University, Department, Course
    from trajectory.models.meta import session
    from trajectory import config as TRJ

    # Handle an empty course request.
    if course is None:
        return set([])

    # Attempt to retrieve list of KAs from the ground truth set.  Any
    # missing level in the nested lookup returns None, making the next
    # .get() raise AttributeError — catch only that, instead of the
    # original bare except which hid unrelated errors.
    try:
        university = TRJ.KA_TRUTH.get(course.department.university.abbreviation)
        department = university.get(course.department.abbreviation)
        labels = department.get(str(course.number))
    except AttributeError:
        return set([])
    if labels is None:
        return set([])

    # Kind of a hack, but guarantees the entered abbreviations will be
    # unique to a knowledge area.
    labels = ["(%s)" % label for label in labels]

    # Query database for knowledge areas.
    knowledge_areas = session.query(Department).join(University)\
            .filter(University.abbreviation=="ACM")\
            .filter(Department.abbreviation=="KA")\
            .first()

    # Handle case where ACM/KA is not present in the database.
    if knowledge_areas is None:
        raise RuntimeError("Knowledge areas not defined.")

    # This is the list of course objects representing knowledge areas.
    knowledge_areas = knowledge_areas.courses

    # Keep the KAs whose parenthesized title suffix matches a label.
    ground_truth_labels = [ka for ka in knowledge_areas
            if ka.title[ka.title.find('('):] in labels]

    return ground_truth_labels
def ground_truth_knowledge_areas(course):
    """
    Look up the set of ground truth manually annotated knowledge areas
    for a course. If none are found, simply return an empty set.

    Returns a list of ACM knowledge-area Course objects, or an empty
    set when the course is None or has no annotation.
    """
    from trajectory.models import University, Department, Course
    from trajectory.models.meta import session
    from trajectory import config as TRJ

    # Handle an empty course request.
    if course is None:
        return set([])

    # Attempt to retrieve list of KAs from the ground truth set.  Any
    # missing level in the nested lookup returns None, making the next
    # .get() raise AttributeError — catch only that, instead of the
    # original bare except which hid unrelated errors.
    try:
        university = TRJ.KA_TRUTH.get(
                course.department.university.abbreviation)
        department = university.get(course.department.abbreviation)
        labels = department.get(str(course.number))
    except AttributeError:
        return set([])
    if labels is None:
        return set([])

    # Kind of a hack, but guarantees the entered abbreviations will be
    # unique to a knowledge area.
    labels = ["(%s)" % label for label in labels]

    # Query database for knowledge areas.
    knowledge_areas = session.query(Department).join(University)\
            .filter(University.abbreviation=="ACM")\
            .filter(Department.abbreviation=="KA")\
            .first()

    # Handle case where ACM/KA is not present in the database.
    if knowledge_areas is None:
        raise RuntimeError("Knowledge areas not defined.")

    # This is the list of course objects representing knowledge areas.
    knowledge_areas = knowledge_areas.courses

    # Keep the KAs whose parenthesized title suffix matches a label.
    ground_truth_labels = [ka for ka in knowledge_areas
            if ka.title[ka.title.find('('):] in labels]

    return ground_truth_labels
def weighted_topic_vector(course, result_set):
    """
    Build the course's weighted topic vector over the global topic list
    of result_set: one entry per global topic, holding the course's
    association proportion for that topic, or 0 if absent.
    """
    global_topics = session.query(Topic)\
            .filter(Topic.result_set==result_set).all()

    # Map topic -> proportion in one pass, replacing the original
    # O(n^2) list.index() scan per global topic.  Keep the first
    # association seen for a topic, matching list.index semantics.
    proportions = {}
    for ta in course.topics:
        if ta.topic.result_set == result_set and ta.topic not in proportions:
            proportions[ta.topic] = ta.proportion

    return [proportions.get(topic, 0) for topic in global_topics]
def get_prereq_tree(course_id, parents=None):
    """
    Recursively identify the prerequisite chain of a course. This tree
    is rooted at the requested parent course and is structured as a
    tuple of tuples.

    Ex: (a
            [ (b, [ ])    prereq of a
              (c, [       prereq of a
                (d, [])   prereq of c
                (e, [])   prereq of c
            ])
        ])
    """
    from trajectory.models import Course
    from trajectory.models.meta import session

    # Sentinel instead of the original mutable default argument
    # (parents=set()), which is shared across calls.
    if parents is None:
        parents = set()

    # Attempt to identify the parent course.
    course = session.query(Course).get(course_id)
    if course is None:
        return None

    # Recursive depth base case: cycle guard.
    if course_id in parents:
        return None
    # New set per call — never mutate the caller's set.
    parents = parents | {course_id}

    # Base case: no prerequisites.
    if len(course.prerequisites) == 0:
        return (course.id, [])

    # Recursive call.
    builder = []
    for prerequisite in course.prerequisites:
        sub_prereqs = get_prereq_tree(prerequisite.id, parents)
        if sub_prereqs is not None:
            builder.append(sub_prereqs)

    # Add recursively determined list.
    return (course.id, builder)
def predicted_knowledge_areas(course, result_set=None):
    """
    Compute the set of ACM knowledge areas assigned to this course by
    determining conceptual overlap between the target course and the
    knowledge areas' inferred topics.

    Returns a set of knowledge-area Course objects ([] when course is
    None, preserving the original interface).
    """
    from trajectory.utils.vector import topic_list
    from trajectory.models import University, Department, Course
    from trajectory.models.meta import session

    # Handle empty case.
    if course is None:
        return []

    # Query database for knowledge areas.
    knowledge_areas = session.query(Department).join(University)\
            .filter(University.abbreviation=="ACM")\
            .filter(Department.abbreviation=="KA")\
            .first()

    # Handle case where ACM/KA is not present in the database.
    if knowledge_areas is None:
        raise RuntimeError("Knowledge areas not defined.")

    # This is the list of course objects representing knowledge areas.
    knowledge_areas = knowledge_areas.courses

    # Map each knowledge area to its inferred topic set.
    knowledge_areas_by_topic = {
        ka: set(topic_list(ka, result_set=result_set))
        for ka in knowledge_areas
    }
    course_topics = set(topic_list(course, result_set=result_set))

    # Knowledge areas sharing at least one topic with the course.
    # (Set comprehension replaces the original set([list comp]).)
    return {
        ka for ka, ka_topics in knowledge_areas_by_topic.items()
        if course_topics & ka_topics
    }
def predicted_knowledge_areas(course, result_set=None):
    """
    Compute the set of ACM knowledge areas assigned to this course by
    determining conceptual overlap between the target course and the
    knowledge areas' inferred topics.

    Returns a set of knowledge-area Course objects ([] when course is
    None, preserving the original interface).
    """
    from trajectory.utils.vector import topic_list
    from trajectory.models import University, Department, Course
    from trajectory.models.meta import session

    # Handle empty case.
    if course is None:
        return []

    # Query database for knowledge areas.
    knowledge_areas = session.query(Department).join(University)\
            .filter(University.abbreviation=="ACM")\
            .filter(Department.abbreviation=="KA")\
            .first()

    # Handle case where ACM/KA is not present in the database.
    if knowledge_areas is None:
        raise RuntimeError("Knowledge areas not defined.")

    # This is the list of course objects representing knowledge areas.
    knowledge_areas = knowledge_areas.courses

    # Map each knowledge area to its inferred topic set.
    knowledge_areas_by_topic = {
        ka: set(topic_list(ka, result_set=result_set))
        for ka in knowledge_areas
    }
    course_topics = set(topic_list(course, result_set=result_set))

    # Knowledge areas sharing at least one topic with the course.
    # (Set comprehension replaces the original set([list comp]).)
    return {
        ka for ka, ka_topics in knowledge_areas_by_topic.items()
        if course_topics & ka_topics
    }
def get_prereq_graph(course_id, format=None):
    """
    Generate a graph of prerequisites within a course. If format is not
    requested, simply return a NetworkX graph object.

    couse_id: the ID of the requested course
    format: what format to return in (optional)
        node: json formatted as node-link style
        adjacency: json formatted as adjacency style
        tree: json formatted as tree style
    """
    from trajectory.models import Department, Course
    from trajectory.models.meta import session
    from trajectory.utils.common import row2dict
    from networkx.readwrite import json_graph
    import networkx as nx
    import json

    # Reject unsupported serialization formats up front.
    if format not in [None, "node", "adjacency", "tree"]:
        raise RuntimeError("Unknown requested data format %s" % format)

    # Initialize a new NetworkX graph.
    G = nx.DiGraph()

    # Attempt to look up the requested course.
    course = session.query(Course).get(course_id)
    if course is None:
        return None

    # Recursively add course ids in a subtree to the graph.
    def add_tree(G, tree, parent=None):
        cid = tree[0] # unpack information
        prereqs = tree[1] # unpack information
        course = session.query(Course).get(cid)

        # Insert all known data, including department abbreviation.
        node_data = row2dict(course)
        node_data['dept'] = course.department.abbreviation

        # Identify the primary course in the graph (the requested).
        if str(cid) == str(course_id):
            node_data['prime'] = True
        else:
            node_data['prime'] = False

        # If the course has already been added, generate a unique ID for it
        # based on its parent, and add it anyway. But don't recurse into
        # its list of prereqs.
        seen = False
        if cid in G.nodes():
            cid = str(parent) + "-" + str(cid)
            seen = True

        # Add course and an edge from its parent, if relevant.
        # NOTE(review): positional attribute dict is NetworkX 1.x API —
        # confirm pinned networkx version.
        G.add_node(cid, node_data)
        if parent is not None:
            G.add_edge(parent, cid)

        # Recurse through the prerequisite tree and add in subtrees.
        if not seen:
            for prereq in prereqs:
                add_tree(G, prereq, cid)

    # Navigate the prerequisite tree and add the course ids as nodes, and
    # prerequisite relationships as unweighted edges.
    prereq_tree = get_prereq_tree(course_id)
    add_tree(G, prereq_tree)
    # NOTE(review): G is always a DiGraph here (constructed above), so
    # this guard is dead code; it may have been meant for prereq_tree.
    if G is None:
        return G

    # Calculate and apply a basic layout.
    pos = nx.spring_layout(G)
    for node in G.nodes():
        # NOTE(review): G.node[...] is NetworkX 1.x API (2.x: G.nodes[...]).
        G.node[node]["viz"] = {
            'position': {
                'x': pos[node][0],
                'y': pos[node][1]
            }
        }

    # Apply any requested data output formatting.
    if format == "node":
        return json.dumps(json_graph.node_link_data(G))
    elif format == "adjacency":
        return json.dumps(json_graph.adjacency_data(G))
    elif format == "tree":
        return json.dumps(json_graph.tree_data(G, int(course_id)))
    else:
        return G
def scrape(args):
    """
    Scrape the available syllabi from the Stanford CS page into a local
    directory.

    NOTE(review): relies on module-level free variables (META, session,
    University, Department, Course, requests, BeautifulSoup, re, clean)
    presumed to be imported at the top of this scraper module.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping Stanford CS data.")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    # Department lookup table keyed by lowercase abbreviation.
    departments = {department.abbreviation.lower() : department
            for department in session.query(Department)\
                .filter(Department.university==university)\
                .all()}

    # Static connection information
    catalog_url = "https://explorecourses.stanford.edu/search?q=CS&view=catalog&filter-departmentcode-CS=on&filter-term-Spring=on&filter-coursestatus-Active=on&page="
    catalog_page = 0
    catalog_page_limit = 8
    # Spoofed browser user-agent; the catalog rejects bare requests.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36\
 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    }

    # Loop forever until we get to the last page and can't "next >" any
    # more, in which case we stop.
    while True:

        # There are currently only 8 pages, so break after we see that
        # many.
        if catalog_page == catalog_page_limit:
            break

        # Generate a BeautifulSoup object.
        response = requests.get(catalog_url + str(catalog_page),
                headers=headers)
        soup = BeautifulSoup(response.text)

        # Identify the course list.
        course_list = soup.find_all(class_="searchResult")

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata: title, "PREFIX NUM:" identifier, text.
            title = course.find(class_="courseTitle").text
            identifier = re.compile("\s+").split(
                    course.find(class_="courseNumber").text)
            prefix = identifier[0]
            cnum = identifier[1][:-1]  # drop trailing ':' from the number
            description = course.find(class_="courseDescription").text

            log.debug(identifier)

            # Identify prerequisites or corequisites.
            # TODO: Match these up with their database entries.
            prereq_index = description.find("Prerequisite")
            if prereq_index > -1:
                prereq_string = description[prereq_index:]
                description = description[:prereq_index]

            # Clean the description string
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            departments[prefix.lower()].courses.append(Course(
                number=cnum,
                title=title,
                description_raw=description_raw,
                description=description))

        # Go to the next page.
        catalog_page = catalog_page + 1

    log.info( "Completed scraping." )
def scrape(args):
    """
    Scrape the available syllabi from the PDX CS page into a local
    directory.

    NOTE(review): relies on module-level free variables (META, session,
    University, Department, Course, requests, BeautifulSoup, re, clean)
    presumed to be imported at the top of this scraper module.
    """
    import logging
    log = logging.getLogger("root")
    log.info( "Scraping PDX CS data." )

    # Construct a soup of the index.
    course_index_url = "http://www.pdx.edu/computer-science/courses"
    course_index = requests.get(course_index_url)
    soup = BeautifulSoup(course_index.text)

    # Identify the list of courses.
    course_list = soup.find("h3", text="Course List")\
            .find_next_sibling("ul")\
            .find_all("a")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    # Department lookup table keyed by lowercase abbreviation.
    departments = {department.abbreviation.lower() : department
            for department in session.query(Department)\
                .filter(Department.university==university)\
                .all()}

    for course in course_list:
        log.debug(course.text)

        # Link text is "PREFIX NUM Title Words...".
        full_title = re.compile("\s+").split(course.text)
        prefix = full_title[0]
        # Cross-listed numbers like "410/510" become "410-510".
        cnum = re.sub('[/]', '-', full_title[1])
        title = ' '.join(full_title[2:])

        # Optionally restrict scraping to the CS department only.
        if args.cs and prefix.lower() not in ["cs"]:
            continue

        # Best-effort fetch of the course page; skip on any failure.
        # NOTE(review): if course['href'] itself raises, course_url is
        # unbound and the log.warn(course_url) below would NameError.
        try:
            course_url = course['href']
            course_soup = BeautifulSoup(requests.get(course_url).text)
        except:
            log.warn("Unable to parse course page.")
            log.warn(course_url)
            continue

        # Find the course description based on its neighbour
        cdesc_re = re.compile(".*Course Description.*")
        cdesc = course_soup.find("table").find(text=cdesc_re)
        if not cdesc.next_sibling:
            cdesc = cdesc.find_parent("td")

        # If there's no course description found, well, forget it
        try:
            description = cdesc.find_next_sibling("td").text
        except:
            log.warn("No course description available.")
            log.warn(course_url)
            continue

        # Clean the description string.
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # Find the course prerequisite list based on its neighbour
        prereq_re = re.compile(".*Prerequi?sites.*")
        prereq = course_soup.find("table").find(text=prereq_re)
        if not prereq.next_sibling:
            prereq = prereq.find_parent("td")

        # If there's no prereq list found, leave it as None
        # NOTE(review): prereq is collected but never stored anywhere.
        try:
            prereq = prereq.find_next_sibling("td").text
        except:
            prereq = None

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info( "Completed scraping." )
def scrape(args):
    """
    Scrape the available syllabi from the RPI CS page into a local
    directory.

    NOTE(review): relies on module-level free variables (META, session,
    University, Department, Course, requests, BeautifulSoup, re, clean)
    presumed to be imported at the top of this scraper module.
    """
    import logging
    log = logging.getLogger("root")
    log.info( "Scraping RPI CS data." )

    # Truncate body at the first occurrence of substring, if present;
    # otherwise return body unchanged.
    def strip_substring(body, substring):
        try:
            return body[:body.index(substring)]
        except ValueError:
            return body

    # Generate a BeautifulSoup object.
    catalog_index_url = "http://catalog.rpi.edu/content.php?filter[27]=CSCI&cur_cat_oid=13&navoid=313"
    catalog_index = requests.get( catalog_index_url )
    soup = BeautifulSoup( catalog_index.text )

    # Identify the list of courses.
    course_list = soup.find_all( name="a",
            onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid.
    catoid_re = re.compile("(?<=catoid=)\d+")
    coid_re = re.compile("(?<=coid=)\d+")

    # Piecewise course page url.
    course_url = "http://catalog.rpi.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
            for department in session.query(Department)\
                .filter(Department.university==university)\
                .all()}

    # Identify relevant information for each course.
    prereqs = {}
    for course in course_list:

        # Generate metadata
        log.debug(course.text)
        full_title = re.compile("\s+").split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get( course_url % (catoid, coid) )
        course_soup = BeautifulSoup( course_page.text )
        content = course_soup.find(class_="block_content").hr.text

        # Clean up the description: trim trailing catalog boilerplate.
        # BUG FIX: the original wrapped both index() calls in a single
        # try block, so when "Credit Hours" was absent the "When
        # Offered" marker was never trimmed either. Trim independently.
        description = content
        description = strip_substring(description, "Credit Hours")
        description = strip_substring(description, "When Offered")

        # Identify prerequisites
        # TODO: Match these up with their database entries.
        prereq_index = description.find("Prerequisit")
        if prereq_index > -1:
            prereq_string = description[prereq_index:]
            description = description[:prereq_index]

            # Match "ABCD 123"-style course references.
            prereq_re = re.compile("\w{2,4}\s\d{3}")
            matches = re.findall(prereq_re, prereq_string)
            if len(matches) > 0:
                prereqs["%s %s" % (prefix, cnum)] = matches

        # Clean the description string
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info( "Completed scraping." )
def scrape(args):
    """
    Scrape the available syllabi from the UTK catalog into a local
    directory.

    NOTE(review): relies on module-level free variables (META, session,
    University, Department, Course, requests, BeautifulSoup, re, func,
    clean) presumed to be imported at the top of this scraper module.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping UTK data.")

    # Constant values.
    catalog_index_url = "http://catalog.utk.edu/preview_course_incoming.php?prefix=%s&catoid=18"
    course_url = "http://catalog.utk.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.
    prereq_re = re.compile("(\s|\\\\xa0)(\d{3})")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
            for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    # Department lookup table keyed by lowercase abbreviation.
    departments = {department.abbreviation.lower() : department
            for department in session.query(Department)\
                .filter(Department.university==university)\
                .all()}

    prereq_dict = {} # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        # Optionally restrict scraping to the CS (COSC) department.
        if args.cs and prefix not in ["cosc"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Clean out garbage: drop <div>s, pull <em> metadata aside.
            [div.extract() for div in content.find_all("div")]
            extra = ' '.join(
                    [em.extract().text for em in content.find_all("em")])
            # NOTE(review): str.strip returns a new string; this result
            # is discarded, so 'extra' keeps any surrounding spaces.
            extra.strip(' ')

            # Clean up the description
            description_raw = content.text.replace('\n', '').strip(' ')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites ("(Co)requisite" in the em text).
            prereq_index = extra.find("requisite")
            prereq_list = None
            if prereq_index > -1:
                prereq_string = extra[prereq_index:]
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    # Naiive assumption that every course number is within
                    # the same department because the department titles are
                    # written out and not matchable.
                    prereq_list = [{
                            "d": prefix.lower(), # department
                            "n": match[-1] # number
                        } for match in matches]

            # Generate the appropriate course object.
            new_course = Course(
                    number=cnum,
                    title=title,
                    description=description,
                    description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        # NOTE(review): department_stack is never used afterwards.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n") # prereq course number
            d = prereq.get("d") # prereq course department abbreviation

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")
def scrape(args):
    """
    Scrape the available syllabi from the SC CS page into a local
    directory.

    NOTE(review): relies on module-level free variables (META, session,
    University, Department, Course, requests, BeautifulSoup, re, clean)
    presumed to be imported at the top of this scraper module.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping SC CS data.")

    # Generate a BeautifulSoup object.
    catalog_index_url = "http://bulletin.sc.edu/content.php?catoid=36&navoid=4242&filter[27]=CSCE"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all( name="a",
            onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid.
    catoid_re = re.compile("(?<=catoid=)\d+")
    coid_re = re.compile("(?<=coid=)\d+")

    # Piecewise course page url.
    course_url = "http://bulletin.sc.edu/preview_course.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
            for department in session.query(Department)\
                .filter(Department.university==university)\
                .all()}

    # Truncate body at the first occurrence of substring, if present;
    # otherwise return body unchanged. (Narrowed from a bare except to
    # ValueError, the only exception str.index raises for a miss.)
    def strip_substring(body, substring):
        try:
            return body[:body.index(substring)]
        except ValueError:
            return body

    # Identify relevant information for each course.
    prereqs = {}
    for course in course_list:

        # Generate metadata
        log.debug(course.text)
        full_title = re.compile("\s+").split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.h1.next_sibling.next_sibling.text

        # Clean the content string
        content = strip_substring(content, "Print-Friendly Page Close Window")

        # Clean the description string: trim trailing boilerplate.
        description_raw = content
        description_raw = strip_substring(description_raw, "Prereq")
        description_raw = strip_substring(description_raw, "Coreq")
        description_raw = strip_substring(description_raw, "Note:")
        description_raw = strip_substring(description_raw, "Cross-listed")
        description = clean(description_raw)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info( "Completed scraping." )
from trajectory.utils.knowledge_areas import ground_truth_knowledge_areas
from trajectory.utils.knowledge_areas import predicted_knowledge_areas
from trajectory.utils.vector import jaccard
from trajectory.models.meta import session
from trajectory.models import Course, Department, University, Topic, ResultSet
import numpy, csv

print("Begin.")

# Look up the GMU CS department and every topic-model result set.
print("Read from database...")
gmu_cs = session.query(Department).filter(Department.abbreviation=="CS")\
        .join(University).filter(University.abbreviation=="GMU")\
        .first()
gmu = gmu_cs.university
result_sets = session.query(ResultSet).all()
print("Done.")

# For each result set, map course id -> predicted knowledge areas and
# course id -> ground-truth knowledge areas.  NOTE(review): the ground
# truth does not depend on the result set, so the 'truth' dict is
# recomputed identically for every result set.
print("Compute predicted and truth knowledge areas...")
knowledge_areas = [{
        'predicted': {
            course.id: predicted_knowledge_areas(course, rs)
            for course in gmu_cs.courses
        },
        'truth': {
            course.id: ground_truth_knowledge_areas(course)
            for course in gmu_cs.courses
        },
    } for rs in result_sets]
def scrape(args):
    """Scrape the available syllabi from the LSU CS page into a local directory.

    Downloads the LSU catalog course index, follows each course link, strips
    the <em> metadata from the description block, and appends a Course to its
    department. This engine does not parse prerequisites.

    :param args: parsed CLI arguments (unused beyond the shared interface).
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping LSU CS data.")

    # Generate a BeautifulSoup object of the catalog index.
    catalog_index_url = "http://catalog.lsu.edu/content.php?filter[27]=CSC&cur_cat_oid=6&navoid=538"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(
            name="a",
            onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid (raw strings for the regex escapes).
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")
    # Hoisted loop invariant: compile the title splitter once.
    whitespace_re = re.compile(r"\s+")

    # Piecewise course page url.
    course_url = "http://catalog.lsu.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower(): department
            for department in session.query(Department)\
            .filter(Department.university==university)\
            .all()}

    # Identify relevant information for each course.
    for course in course_list:

        # Generate metadata.
        log.debug(course.text)
        full_title = whitespace_re.split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[2:-1])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.find(class_="block_content").hr

        # Remove the metadata (<em> elements) — plain loop instead of a
        # side-effect list comprehension.
        for em in content.find_all("em"):
            em.decompose()

        # Clean the description string.
        description_raw = content.text
        description = clean(content.text)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
                number=cnum,
                title=title,
                description_raw=description_raw,
                description=description))

    log.info("Completed scraping.")
def scrape(args):
    """Scrape the available syllabi from the RPI CS page into a local directory.

    Downloads the RPI catalog course index, follows each course link, strips
    trailing metadata from the description, and records prerequisite matches
    in a local dict (they are not yet linked to database entries).

    :param args: parsed CLI arguments (unused beyond the shared interface).
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping RPI CS data.")

    def strip_substring(body, substring):
        # Truncate body at the first occurrence of substring, if present.
        try:
            return body[:body.index(substring)]
        except ValueError:  # substring not found: leave body untouched
            return body

    # Generate a BeautifulSoup object of the catalog index.
    catalog_index_url = "http://catalog.rpi.edu/content.php?filter[27]=CSCI&cur_cat_oid=13&navoid=313"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(name="a", onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid (raw strings for the regex escapes).
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")
    # Hoisted loop invariants: compile once, not per course.
    whitespace_re = re.compile(r"\s+")
    prereq_re = re.compile(r"\w{2,4}\s\d{3}")

    # Piecewise course page url.
    course_url = "http://catalog.rpi.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower(): department
            for department in session.query(Department)\
            .filter(Department.university==university)\
            .all()}

    # Identify relevant information for each course.
    prereqs = {}
    for course in course_list:

        # Generate metadata.
        log.debug(course.text)
        full_title = whitespace_re.split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.find(class_="block_content").hr.text

        # Clean up the description. Strip each trailer independently:
        # the original chained both .index() calls in one try block, so a
        # missing "Credit Hours" marker also skipped the "When Offered" strip.
        description = content
        description = strip_substring(description, "Credit Hours")
        description = strip_substring(description, "When Offered")

        # Identify prerequisites.
        # TODO: Match these up with their database entries.
        prereq_index = description.find("Prerequisit")
        if prereq_index > -1:
            prereq_string = description[prereq_index:]
            description = description[:prereq_index]
            matches = prereq_re.findall(prereq_string)
            if len(matches) > 0:
                prereqs["%s %s" % (prefix, cnum)] = matches

        # Clean the description string.
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(
                Course(number=cnum,
                       title=title,
                       description_raw=description_raw,
                       description=description))

    log.info("Completed scraping.")
def scrape(args):
    """Scrape the available data from the AU catalog into a database.

    Phase 1 walks every department prefix, creates Course objects, and
    collects raw prerequisite matches per course. Phase 2 resolves those
    matches against the database and links Course.prerequisites.

    :param args: parsed CLI arguments; ``args.cs`` restricts to the CS prefix.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping American University data.")

    # Constant values.
    catalog_index_url = "http://catalog.american.edu/preview_course_incoming.php?prefix=%s&catoid=3"
    course_url = "http://catalog.american.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid (raw string for regex escapes).
    catid_re = re.compile(r"(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles, e.g. "CSC-280".
    prereq_re = re.compile(r"([A-Z]{3,4})(-)(\d{3})")

    # Hoisted loop invariant: compile the title splitter once.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
            for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University).filter(University.name == university).first()
    departments = {
        department.abbreviation.lower(): department
        for department in session.query(Department).filter(Department.university == university).all()
    }

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["csc"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a", onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[0].split("-")[1]
            title = " ".join(full_title[1:])

            # Identify coid to get description.
            onclick = course["onclick"]
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr.br

            # Remove garbage — plain loop, not a side-effect comprehension.
            for div in content.find_all("div"):
                div.extract()

            # Grab extra data (the <em> run holds prerequisite text),
            # detaching each node as we walk.
            extra = ""
            em = content.find("em")
            while em:
                if isinstance(em, NavigableString):
                    extra += em
                else:
                    extra += em.text
                rem = em
                em = em.next_sibling
                rem.extract()
            # BUG FIX: the original computed this normalization but discarded
            # the result; assign it so prereq matching sees the cleaned text.
            extra = extra.replace("\n", "").replace("\xa0", "").strip(" ")

            # Clean up the description.
            description_raw = content.text.replace("\n", "").replace("\xa0", "").strip(" ")
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites.
            prereq_index = extra.find("requis")
            prereq_list = None
            if prereq_index > -1:
                matches = prereq_re.findall(extra[prereq_index:])
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{"d": match[0],   # department
                                    "n": match[-1]}  # number
                                   for match in matches]

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue
        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation
            if d.startswith("OR ") or d.startswith("ND "):
                d = d[3:]

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = (
                session.query(Course)
                .join(Department)
                .filter(Department.university == university)
                .filter(func.lower(Department.abbreviation) == d.lower())
                .filter(Course.number == int(n))
                .first()
            )

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")
def scrape(args):
    """Scrape the available syllabi from the Utah catalog into a local directory.

    Phase 1 walks every department prefix, parses the per-course heading
    block (e.g. "Enrollment Requirement:"), and creates Course objects.
    Phase 2 resolves collected prerequisite matches against the database.

    :param args: parsed CLI arguments; ``args.cs`` restricts to the CS prefix.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping Utah data.")

    # Constant values.
    catalog_index_url = "http://catalog.utah.edu/preview_course_incoming.php?prefix=%s&catoid=6"
    course_url = "http://catalog.utah.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid (raw string for regex escapes).
    catid_re = re.compile(r"(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles. Note \\xa0 matches the
    # literal text "\xa0" (an escaped non-breaking space) in the scraped page.
    prereq_re = re.compile(
        r"(([A-Z]{2,5})|([A-Z]{2}(\s|\\xa0)[A-Z]{2})|(H(\s|\\xa0)[A-Z]{3}))(\s|\\xa0)(\d{4})"
    )

    # Hoisted loop invariant: compile the title splitter once.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [
        department.get("abbreviation").lower()
        for department in META.get("departments")
    ]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower(): department
            for department in session.query(Department)\
            .filter(Department.university==university)\
            .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(
                name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Identify all the course heading data: for each <strong> label,
            # accumulate sibling text until the next <br>.
            strongs = content.find_all("strong")
            headings = {}
            tag = content
            for strong in strongs:
                tag = strong.next_sibling
                text = ""
                while True:
                    if tag.name == "br":
                        break
                    if isinstance(tag, NavigableString):
                        text += tag
                    else:
                        text += tag.text
                    tag = tag.next_sibling
                headings[strong.text] = text

            # Update the cursor to post-heading data.
            content = tag

            # Remove the footer links — plain loop, not a side-effect
            # comprehension.
            for a in tag.find_all("a"):
                a.extract()

            # Clean up the description.
            description_raw = content.text.replace('\n', '')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites.
            prereq_string = headings.get("Enrollment Requirement:")
            prereq_list = None
            if prereq_string is not None:
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],   # department
                        "n": match[-1]   # number
                    } for match in matches]

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue
        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation
            if d.startswith("OR ") or d.startswith("ND "):
                d = d[3:]

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")
def scrape(args):
    """Scrape the available syllabi from the KSU CS page into a local directory.

    Phase 1 walks every registered department prefix, parses course
    descriptions and "Requisites" sections, and creates Course objects.
    Phase 2 resolves collected prerequisite matches against the database.

    :param args: parsed CLI arguments; ``args.cs`` restricts to the CIS prefix.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping KSU data.")

    def strip_substring(body, substring):
        # Truncate body at the first occurrence of substring, if present.
        # (Hoisted out of the per-course loop; previously re-defined each
        # iteration, with a bare except that swallowed everything.)
        try:
            return body[:body.index(substring)]
        except ValueError:  # substring not found: leave body untouched
            return body

    # Constant values.
    catalog_index_url = "http://catalog.k-state.edu/content.php?filter[27]=%s&cur_cat_oid=13&navoid=1425"
    course_url = "http://catalog.k-state.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Scraper regexes (raw strings so \d, \s are regex escapes; \\xa0 matches
    # the literal text "\xa0" in the scraped page).
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")
    prereq_re = re.compile(r"([A-Za-z,]{2,5})(\s|\\xa0)(\d{3})")
    whitespace_re = re.compile(r"\s+")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower(): department
            for department in session.query(Department)\
            .filter(Department.university==university)\
            .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in departments.keys():
        if args.cs and prefix not in ["cis"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(
                name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])
            title = title.replace("'", "")

            # Identify coid to get description.
            href = course['href']
            catoid = catoid_re.search(href).group(0)
            coid = coid_re.search(href).group(0)

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catoid, coid))
            course_soup = BeautifulSoup(course_page.text)

            # Identify the components of the description and its metadata.
            result_set = course_soup.find(class_="block_content") \
                    .table \
                    .find_next_sibling("p") \
                    .h1 \
                    .find_next_siblings()

            # Join them together as text.
            content = ' '.join([r.text for r in result_set[1:]])

            # Clean up the description: drop trailing metadata sections.
            description = content
            description = strip_substring(description, "Note")
            description = strip_substring(description, "Requisites")
            description = strip_substring(description, "When Offered")
            description = strip_substring(description, "UGE course")

            # Identify prerequisites.
            prereq_index = content.find("Requisites")
            prereq_list = None
            if prereq_index > -1:
                # Grab the substring of prereqs and find matches.
                prereq_string = content[prereq_index:]
                prereq_string = strip_substring(prereq_string, "Note")
                prereq_string = strip_substring(prereq_string, "When Offered")
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],  # department
                        "n": match[2]   # number
                    } for match in matches]

            # Clean the description string.
            description_raw = description
            description = clean(description_raw)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(
                    number=cnum,
                    title=title,
                    description=description,
                    description_raw=description_raw)
            departments[prefix].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue
        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")
# NOTE(review): the lines below are the tail of a topic-vector builder
# (presumably `weighted_topic_vector`) whose `def` header and topic loop lie
# outside this view — confirm against the full file.
if topic not in course_topics:
    # Topic absent from this course: a zero weight keeps vectors aligned.
    topic_vector.append(0)
else:
    topic_index = course_topics.index(topic)
    proportion = course_topic_assocs[topic_index].proportion
    topic_vector.append(proportion)
return topic_vector

def prerequisite_distances(course, result_set):
    """Return one similarity score per prerequisite of *course*.

    Each score is 1 minus the Euclidean distance between the course's
    weighted topic vector and the prerequisite's, under *result_set*.
    Returns an empty list when the course has no prerequisites.
    """
    course_vector = numpy.array(weighted_topic_vector(course, result_set))
    prerequisite_vectors = [numpy.array(weighted_topic_vector(prereq, result_set))
            for prereq in course.prerequisites]
    return [1 - numpy.linalg.norm(course_vector - prereq_vector)
            for prereq_vector in prerequisite_vectors]

def university_prerequisite_statistics(abbreviation, result_set):
    """Return (mean, stdv) of course/prerequisite similarity scores across
    every course of the university identified by *abbreviation*."""
    uni_courses = session.query(Course).join(Department).join(University).filter(University.abbreviation==abbreviation).all()
    prereq_distances = [prerequisite_distances(course, result_set) for course in uni_courses]
    prereq_distances = [p for p in prereq_distances if p]  # strip courses with no prerequisites
    # Flatten the per-course lists before aggregating.
    mean = numpy.mean(list(chain.from_iterable(prereq_distances)))
    stdv = numpy.std(list(chain.from_iterable(prereq_distances)))
    return (mean, stdv)

# Result set 51 is hard-coded as the reference model run here.
rs51 = session.query(ResultSet).get(51)
result_sets = session.query(ResultSet).all()

#universities = session.query(University).all()
#for uni in universities:
#    (mean, stdv) = university_prerequisite_statistics(uni.abbreviation, rs51)
#    print("%s Mean: %0.3f Std: %0.3f" % (uni.abbreviation, mean, stdv))
def scrape(args):
    """Scrape the available syllabi from the GMU CS page into a local directory.

    Phase 1 walks every department prefix, parses descriptions and
    "Prerequisite(s)" sections, and creates Course objects. Phase 2 resolves
    collected prerequisite matches against the database.

    :param args: parsed CLI arguments; ``args.cs`` restricts to cs/ait.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping GMU data.")

    # Constant values.
    catalog_index_url = "http://catalog.gmu.edu/preview_course_incoming.php?cattype=combined&prefix=%s"
    course_url = "http://catalog.gmu.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid (raw string for regex escapes).
    catid_re = re.compile(r"(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles. Note \\xa0 matches the
    # literal text "\xa0" (an escaped non-breaking space) in the scraped page.
    prereq_re = re.compile(r"([A-Za-z,]{2,4})(\s|\\xa0)(\d{3})")

    # Hoisted loop invariant: compile the title splitter once.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
            for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower(): department
            for department in session.query(Department)\
            .filter(Department.university==university)\
            .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs", "ait"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(
                name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr.text

            # Clean up the description.
            description = content
            try:
                description = description[:description.index("Hours of Lecture")]
            except ValueError:  # marker absent: keep full description
                pass

            # Identify prerequisites.
            prereq_index = description.find("Prerequisite(s)")
            prereq_list = None
            if prereq_index > -1:
                # Grab the substring of prereqs and find matches.
                notes_index = description.find("Notes")
                # BUG FIX: find() returns -1 when "Notes" is absent, which
                # previously sliced off the final character of the substring.
                if notes_index == -1:
                    notes_index = len(description)
                prereq_string = description[prereq_index:notes_index]
                description = description[:prereq_index]
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],  # department
                        "n": match[2]   # number
                    } for match in matches]

            # Clean the description string.
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(
                    number=cnum,
                    title=title,
                    description=description,
                    description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue
        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")
from trajectory.models.meta import session
from trajectory.models import Course, Department, University, ResultSet
from trajectory.utils.vector import jaccard, topic_list
import numpy

# Pairwise topic-overlap comparison between departments: for each ordered
# pair, compute the Jaccard index over their topic lists under one result set.

result_sets = session.query(ResultSet).all()
rs = session.query(ResultSet).get(51)

# Exclude the ACM "KA" pseudo-department and GMU's AIT department from the
# comparison matrix.
departments = [
    department
    for department in session.query(Department).all()
    if not ((department.university.abbreviation == "ACM"
             and department.abbreviation == "KA")
            or (department.university.abbreviation == "GMU"
                and department.abbreviation == "AIT"))
]

def compare(depA, depB, rs):
    """Jaccard similarity between the topic lists of two departments."""
    return jaccard(topic_list(depA, rs), topic_list(depB, rs))

# comparisons[depA][depB] holds the similarity for each ordered pair.
comparisons = {}
for depA in departments:
    print(depA.university.abbreviation)
    comparisons[depA] = {depB: compare(depA, depB, rs) for depB in departments}

print("done")
def scrape(args):
    """Scrape the available syllabi from the Utah catalog into a local directory.

    Phase 1 walks every department prefix, parses the per-course heading
    block (e.g. "Enrollment Requirement:"), and creates Course objects.
    Phase 2 resolves collected prerequisite matches against the database.

    :param args: parsed CLI arguments; ``args.cs`` restricts to the CS prefix.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping Utah data.")

    # Constant values.
    catalog_index_url = "http://catalog.utah.edu/preview_course_incoming.php?prefix=%s&catoid=6"
    course_url = "http://catalog.utah.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid (raw string for regex escapes).
    catid_re = re.compile(r"(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles. Note \\xa0 matches the
    # literal text "\xa0" (an escaped non-breaking space) in the scraped page.
    prereq_re = re.compile(
        r"(([A-Z]{2,5})|([A-Z]{2}(\s|\\xa0)[A-Z]{2})|(H(\s|\\xa0)[A-Z]{3}))(\s|\\xa0)(\d{4})")

    # Hoisted loop invariant: compile the title splitter once.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
            for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower(): department
            for department in session.query(Department)\
            .filter(Department.university==university)\
            .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs"]:
            continue

        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(
                name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Identify all the course heading data: for each <strong> label,
            # accumulate sibling text until the next <br>.
            strongs = content.find_all("strong")
            headings = {}
            tag = content
            for strong in strongs:
                tag = strong.next_sibling
                text = ""
                while True:
                    if tag.name == "br":
                        break
                    if isinstance(tag, NavigableString):
                        text += tag
                    else:
                        text += tag.text
                    tag = tag.next_sibling
                headings[strong.text] = text

            # Update the cursor to post-heading data.
            content = tag

            # Remove the footer links — plain loop, not a side-effect
            # comprehension.
            for a in tag.find_all("a"):
                a.extract()

            # Clean up the description.
            description_raw = content.text.replace('\n', '')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites.
            prereq_string = headings.get("Enrollment Requirement:")
            prereq_list = None
            if prereq_string is not None:
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                        "d": match[0],   # department
                        "n": match[-1]   # number
                    } for match in matches]

            # Generate the appropriate course object.
            new_course = Course(
                    number=cnum,
                    title=title,
                    description=description,
                    description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue
        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation
            if d.startswith("OR ") or d.startswith("ND "):
                d = d[3:]

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")
def scrape(args):
    """Routes scraping to the appropriate scraper module.

    For each requested target: import its engine module, register its
    university and departments with the database (if not already present),
    then run the engine's scrape() unless courses already exist for it.

    :param args: parsed CLI arguments; ``args.targets`` names engine modules,
        ``args.cs`` restricts registration to CS-related departments.
    """
    from trajectory.models import University, Department, Course
    from trajectory.models.meta import session
    import logging
    from importlib import import_module

    log = logging.getLogger("root")
    log.info("Selected scraping targets: %s." % args.targets)

    # Loop over the requested targets and call their scrape function.
    for target in args.targets:
        log.info("Engaging scraper engine: %s" % target)

        # Prepend the target name with a dot for importing.
        try:
            target_module = ".%s" % target
            scraper = import_module(target_module, "trajectory.engines")
        except ImportError:
            # Logger.warn is a deprecated alias; use warning().
            log.warning("No engine named '%s'." % target)
            continue

        # Register the target with the database, if not already present.
        try:
            metadata = scraper.META
            university = metadata.get("school")
            university_query = session.query(University)\
                    .filter(University.name==university.get("name"))

            # If the university has already been registered, alert the user
            # but grab a reference to it for the Departments.
            if university_query.count() > 0:
                university = university_query.first()
                log.warning("University '%s' already registered." %
                        university.name)

            # If the university has not been registered, register a new one.
            else:
                log.info("Registering university '%s' with database." %
                        university.get("name"))
                university = University(
                        name=university.get("name"),
                        abbreviation=university.get("abbreviation"),
                        url=university.get("url"))

                # Add the university to the session.
                session.add(university)

            # Loop over the departments defined in the metadata.
            departments = metadata.get("departments")
            for department in departments:
                if args.cs and department.get("abbreviation").lower() not in [
                        "ec", "ka", "cosc", "csce", "ait",
                        "cs", "cis", "csc", "csci"]:
                    continue
                department_query = session.query(Department)\
                        .join(University)\
                        .filter(Department.name==department.get("name"))\
                        .filter(Department.university_id==university.id)

                # If the department has been registered, alert the user.
                if department_query.count() > 0:
                    log.warning("Department '%s' already registered." %
                            department.get("name"))
                    continue

                # Otherwise register a new one.
                else:
                    university.departments.append(Department(
                            name=department.get("name"),
                            abbreviation=department.get("abbreviation"),
                            url=department.get("url")))
                    log.info("Registering department '%s' with database." %
                            department.get("name"))

        except AttributeError as e:
            # Engine module exists but does not define META correctly.
            log.warning("Target %s metadata not defined." % target)
            log.warning("Terminating engine.")
            log.debug(e)
            continue

        # Begin downloading course data.
        try:
            # Check if there are already courses defined for any departments
            # within this university. If there are, skip this target.
            if session.query(Course).join(Department) \
                    .filter(Course.department_id==Department.id)\
                    .filter(Department.university==university)\
                    .count() > 0:
                log.warning("Target %s already has courses defined." % target)

            # Otherwise, go ahead and scrape the course data for this target.
            else:
                scraper.scrape(args)

        except NotImplementedError as e:
            log.warning("Target %s has not been defined. Skipping." % target)

    log.info("Disengaging scraper engine.")
def export(args):
    """
    Read all data from the database and store it to disk for analysis.

    Creates args.data_directory, then one folder per university and one per
    (requested) department, and writes each course description to a file
    named "<course id>.txt" inside its department folder. The database ID
    gives an absolute reference to the course, distinguishing multiple
    entries of the same course with different titles.

    :param args: parsed CLI namespace. Reads args.data_directory,
        args.departments (list of abbreviations to keep) and args.cs
        (shortcut that overrides args.departments with CS-related
        abbreviations).
    """
    from trajectory.models import Course, Department, University
    from trajectory.models.meta import session
    import os
    import logging

    log = logging.getLogger("root")
    log.info("Begin data export.")

    # Create the base output directory; refuse to clobber an existing one.
    try:
        os.mkdir(args.data_directory)
        log.debug("Creating folder %s." % args.data_directory)
    except FileExistsError:
        log.error("Data directory '%s' already exists." % args.data_directory)
        return

    # Get access to the data.
    universities = session.query(University).all()

    # Dump data in folders broken down by university.
    for university in universities:

        # Create folder to store university data.
        path = os.path.join(args.data_directory, university.abbreviation)
        os.mkdir(path)
        log.debug("Creating folder %s." % path)

        # Retrieve list of known university departments.
        departments = university.departments

        # CS department shortcut.
        if args.cs:
            args.departments = ["cosc", "csce", "ec", "ka", "ait",
                                "cs", "cis", "csc", "csci"]

        # Filter by the requested departments (case-insensitively).
        if args.departments:
            depts_lowered = [d.lower() for d in args.departments]
            departments = [d for d in departments
                    if d.abbreviation.lower() in depts_lowered]

        # Dump data in folders broken down by department (prefix).
        for department in departments:

            # Create folder to store department data.
            path = os.path.join(
                    args.data_directory,
                    university.abbreviation,
                    department.abbreviation)
            os.mkdir(path)
            log.debug("Dumping to folder %s." % path)

            # Label a course with its database ID, which is unique, so
            # each course maps to exactly one file.
            label = lambda course: "%d.txt" % course.id

            # Write course descriptions to files.
            for course in department.courses:
                course_path = os.path.join(path, label(course))
                # BUG FIX: the original re-joined the same deterministic
                # path inside `while os.path.isfile(course_path): ...`,
                # which spun forever whenever the file already existed.
                # Course IDs are unique, so simply (over)write the file.
                with open(course_path, "w") as course_file:
                    course_file.write(course.description)

    log.info("Data export complete.")
def scrape(args):
    """
    Scrape the available syllabi from the KSU CS page into a local
    directory.

    Walks the K-State online catalog index for every department already
    registered in the database, parses each course's number, title and
    description, stores new Course rows, and finally links prerequisite
    courses found by a regex heuristic.

    :param args: parsed CLI namespace; reads args.cs (when truthy, only
        the "cis" prefix is scraped).

    NOTE(review): `re`, `requests`, `BeautifulSoup`, `META`, `session`,
    `University`, `Department`, `Course`, `func` and `clean` are
    module-level names imported outside this view — confirm at file top.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping KSU data.")

    # Constant values.
    # The index URL is parameterized by department prefix; the course URL
    # by catalog id (catoid) and course id (coid).
    catalog_index_url = "http://catalog.k-state.edu/content.php?filter[27]=%s&cur_cat_oid=13&navoid=1425"
    course_url = "http://catalog.k-state.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Scraper regexes.
    # catoid/coid are pulled out of each course link's query string.
    catoid_re = re.compile("(?<=catoid=)\d+")
    coid_re = re.compile("(?<=coid=)\d+")
    # Matches "<dept> <3-digit number>" pairs; the middle group also
    # accepts a literal "\xa0" sequence leaking from the page markup.
    prereq_re = re.compile("([A-Za-z,]{2,5})(\s|\\\\xa0)(\d{3})")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    # Map lowercase abbreviation -> Department row for quick attach.
    departments = {department.abbreviation.lower() : department
            for department in session.query(Department)\
                .filter(Department.university==university)\
                .all()}

    prereq_dict = {} # Dictionary of Course : Prereq match list
    for prefix in departments.keys():
        # Honor the --cs shortcut: only scrape the CIS department.
        if args.cs and prefix not in ["cis"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(
                name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata. Link text looks like
            # "<prefix> <number> - <title words...>" — TODO confirm,
            # which is why the title starts at token index 3.
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            #prefix = full_title[0]
            cnum = full_title[1]
            title = ' '.join(full_title[3:])
            title = title.replace("'", "")

            # Identify coid to get description.
            href = course['href']
            catoid = catoid_re.search(href).group(0)
            coid = coid_re.search(href).group(0)

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catoid, coid))
            course_soup = BeautifulSoup(course_page.text)

            # Identify the components of the description and its metadata
            # by walking the page structure (brittle: tied to the current
            # catalog markup).
            result_set = course_soup.find(class_="block_content") \
                    .table \
                    .find_next_sibling("p") \
                    .h1 \
                    .find_next_siblings()

            # Join them together as text.
            content = ' '.join([r.text for r in result_set[1:]])

            # Clean up the description.
            # Truncates `body` at the first occurrence of `substring`;
            # returns it unchanged when the substring is absent.
            def strip_substring(body, substring):
                try:
                    return body[:body.index(substring)]
                except:
                    return body

            # Drop trailing metadata sections from the description.
            description = content
            description = strip_substring(description, "Note")
            description = strip_substring(description, "Requisites")
            description = strip_substring(description, "When Offered")
            description = strip_substring(description, "UGE course")

            # Identify prerequisites.
            prereq_index = content.find("Requisites")
            prereq_list = None
            if prereq_index > -1:

                # Grab the substring of prereqs and find matches.
                prereq_string = content[prereq_index:]
                prereq_string = strip_substring(prereq_string, "Note")
                prereq_string = strip_substring(prereq_string, "When Offered")
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                            "d": match[0], # department
                            "n": match[2]  # number
                        } for match in matches]

            # Clean the description string
            description_raw = description
            description = clean(description_raw)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(
                    number=cnum,
                    title=title,
                    description=description,
                    description_raw=description_raw)
            departments[prefix].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue
        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        # department_stack remembers the last real department prefix so
        # connective tokens ("and", "or", ",") inherit it.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n") # prereq course number
            d = prereq.get("d") # prereq course department abbreviation

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError: # no previous courses
                continue

            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")
def scrape(args):
    """
    Scrape the available syllabi from the Stanford CS page into the
    database.

    Pages through the explorecourses.stanford.edu catalog search results
    for the CS department, parses each course's number, title and
    description, and attaches new Course rows to the matching Department.

    :param args: parsed CLI namespace (currently unused by this engine;
        kept for interface parity with the other scrapers).

    NOTE(review): `re`, `requests`, `BeautifulSoup`, `META`, `session`,
    `University`, `Department`, `Course` and `clean` are module-level
    names imported outside this function.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping Stanford CS data.")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    # Map lowercase abbreviation -> Department row for quick attach.
    departments = {department.abbreviation.lower(): department
            for department in session.query(Department)\
                .filter(Department.university==university)\
                .all()}

    # Static connection information.
    catalog_url = "https://explorecourses.stanford.edu/search?q=CS&view=catalog&filter-departmentcode-CS=on&filter-term-Spring=on&filter-coursestatus-Active=on&page="
    catalog_page_limit = 8  # catalog currently spans 8 result pages
    headers = {
        # Spoof a desktop browser; the original built this string with a
        # line continuation that leaked indentation into the value.
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    }

    # Raw string avoids the deprecated invalid escape sequence "\s" in a
    # plain string literal; compiled once outside the loops.
    whitespace_re = re.compile(r"\s+")

    # Walk catalog pages 0 .. limit-1 (the original looped forever with a
    # manual counter and a break at the limit).
    for catalog_page in range(catalog_page_limit):

        # Generate a BeautifulSoup object.
        response = requests.get(catalog_url + str(catalog_page),
                headers=headers)
        soup = BeautifulSoup(response.text)

        # Identify the course list.
        course_list = soup.find_all(class_="searchResult")

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.
            title = course.find(class_="courseTitle").text
            identifier = whitespace_re.split(
                    course.find(class_="courseNumber").text)
            prefix = identifier[0]
            # Drop the trailing character of the number token (presumably
            # a ':' suffix in the markup — TODO confirm).
            cnum = identifier[1][:-1]
            description = course.find(class_="courseDescription").text
            log.debug(identifier)

            # Identify prerequisites or corequisites: everything from the
            # "Prerequisite" marker onward is cut from the description.
            # TODO: Match these up with their database entries.
            prereq_index = description.find("Prerequisite")
            if prereq_index > -1:
                description = description[:prereq_index]

            # Clean the description string.
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            departments[prefix.lower()].courses.append(
                    Course(number=cnum,
                        title=title,
                        description_raw=description_raw,
                        description=description))

    log.info("Completed scraping.")
def scrape(args):
    """
    Scrape the available syllabi from the UTK catalog into the database.

    Downloads the catalog listing for each department prefix declared in
    META, parses number/title/description for every course, heuristically
    extracts prerequisite course numbers from the emphasized ("em") page
    text, and links prerequisite courses once all courses exist.

    :param args: parsed CLI namespace; reads args.cs (when truthy, only
        the "cosc" prefix is scraped).

    NOTE(review): `re`, `requests`, `BeautifulSoup`, `META`, `session`,
    `University`, `Department`, `Course`, `func` and `clean` are
    module-level names imported outside this function.
    """
    import logging
    log = logging.getLogger("root")
    log.info("Scraping UTK data.")

    # Constant values.
    catalog_index_url = "http://catalog.utk.edu/preview_course_incoming.php?prefix=%s&catoid=18"
    course_url = "http://catalog.utk.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.
    prereq_re = re.compile("(\s|\\\\xa0)(\d{3})")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
            for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name == university)\
            .first()
    # Map lowercase abbreviation -> Department row for quick attach.
    departments = {
        department.abbreviation.lower(): department
        for department in session.query(Department)
            .filter(Department.university == university).all()
    }

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        # Honor the --cs shortcut: only scrape the COSC department.
        if args.cs and prefix not in ["cosc"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(
                name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.
            log.debug(course.text)
            full_title = re.compile("\s+").split(course.text)
            cnum = full_title[1]
            title = " ".join(full_title[3:])

            # Identify coid to get description.
            onclick = course["onclick"]
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Clean out garbage: drop div elements entirely, and pull the
            # emphasized text (which holds the prerequisite notes) aside.
            for div in content.find_all("div"):
                div.extract()
            extra = " ".join([em.extract().text
                    for em in content.find_all("em")])
            # BUG FIX: str.strip returns a new string; the original call
            # discarded its result.
            extra = extra.strip(" ")

            # Clean up the description.
            description_raw = content.text.replace("\n", "").strip(" ")
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites.
            prereq_index = extra.find("requisite")
            prereq_list = None
            if prereq_index > -1:
                prereq_string = extra[prereq_index:]
                matches = prereq_re.findall(prereq_string)
                if len(matches) > 0:
                    # Split them up as a dict and store them in a list.
                    # Naiive assumption that every course number is within
                    # the same department because the department titles are
                    # written out and not matchable.
                    prereq_list = [{"d": prefix.lower(),  # department
                                    "n": match[-1]}       # number
                            for match in matches]

            # Generate the appropriate course object.
            new_course = Course(
                    number=cnum,
                    title=title,
                    description=description,
                    description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if len(prereq_list) == 0:
            continue
        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple. (The unused
        # department_stack from the KSU engine was removed — every prereq
        # here already carries its department prefix.)
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation
            log.debug("Searching for: %s %s" % (d, n))

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = (
                session.query(Course)
                .join(Department)
                .filter(Department.university == university)
                .filter(func.lower(Department.abbreviation) == d.lower())
                .filter(Course.number == int(n))
                .first()
            )

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'." % (d, n))

    log.info("Completed scraping.")