def import_results(args):
    """
    Read topic data generated from the learn module and store it in the
    database.

    Expects ``args`` to provide: ``alpha``, ``beta``, ``iterations`` (model
    parameters recorded on the new ResultSet), ``topic_file`` (CSV of
    topic id followed by its words), and ``course_file`` (CSV whose rows
    are course id followed by "topicid:weight" pairs).
    """
    from trajectory.models import Course, Topic, CourseTopicAssociation
    from trajectory.models import ResultSet
    from trajectory.models.meta import session
    from trajectory import config as TRJ
    import logging
    import csv

    log = logging.getLogger("root")
    log.info("Begin topic import.")

    # Create a new result set to group everything imported in this run.
    result_set = ResultSet(
        alpha=args.alpha,
        beta=args.beta,
        iterations=args.iterations
    )
    session.add(result_set)
    session.commit()

    # Add in new topic definitions.
    with open(args.topic_file, "r") as topic_file:
        topic_reader = csv.reader(topic_file, delimiter=",")
        next(topic_reader, None)  # skip header
        topic_count = 0
        for topic in topic_reader:
            topic_count += 1
            session.add(Topic(
                id=topic[0],
                result_set=result_set,
                words=', '.join(topic[1:])
            ))
        result_set.num_topics = topic_count

    # Add the topics to their courses.
    course_query = session.query(Course)

    with open(args.course_file, "r") as course_file:
        course_reader = csv.reader(course_file, delimiter=",")
        next(course_reader, None)  # skip header

        # Map each known course to its [topic id, weight] pairs, keeping
        # only topics above the configured minimum weight.
        # Structure: {course: [[id, weight], [id, weight], ...]}
        topics_to_add = {}
        for row in course_reader:
            # Single primary-key lookup per row (the old comprehension
            # queried twice: once for the None guard, once for the key).
            course = course_query.get(row[1])
            if course is None:
                # Unknown course id in the CSV; skip the row.
                continue
            pairs = [topic.split(':') for topic in row[2:]]
            topics_to_add[course] = [
                pair for pair in pairs
                if float(pair[1]) > TRJ.TOPIC_MIN_WEIGHT
            ]

        for course, topic_list in topics_to_add.items():
            for (topicid, proportion) in topic_list:
                association = CourseTopicAssociation(proportion=proportion)
                association.topic_id = topicid
                association.result_set_id = result_set.id
                course.topics.append(association)

    log.info("Topic import complete.")
def scrape(args):
    """
    Routes scraping to the appropriate scraper module.

    For each name in ``args.targets``: dynamically import the matching
    engine from ``trajectory.engines``, register its university and
    departments with the database (skipping any already present), then run
    the engine's ``scrape()`` unless courses already exist for that
    university. ``args.cs`` restricts department registration to
    CS-related departments.
    """
    from trajectory.models import University, Department, Course
    from trajectory.models.meta import session
    import logging
    from importlib import import_module

    log = logging.getLogger("root")
    log.info("Selected scraping targets: %s." % args.targets)

    # Department abbreviations treated as "computer science" for --cs.
    cs_abbreviations = [
        "ec", "ka", "cosc", "csce", "ait", "cs", "cis", "csc", "csci"
    ]

    # Loop over the requested targets and call their scrape function.
    for target in args.targets:
        log.info("Engaging scraper engine: %s" % target)

        # Prepend the target name with a dot for a relative import out of
        # the trajectory.engines package.
        try:
            target_module = ".%s" % target
            scraper = import_module(target_module, "trajectory.engines")
        except ImportError:
            log.warning("No engine named '%s'." % target)
            continue

        # Register the target with the database, if not already present.
        try:
            metadata = scraper.META
            university = metadata.get("school")
            university_query = session.query(University)\
                    .filter(University.name == university.get("name"))

            # If the university has already been registered, alert the
            # user but grab a reference to it for the Departments.
            if university_query.count() > 0:
                university = university_query.first()
                log.warning("University '%s' already registered." %
                        university.name)

            # If the university has not been registered, register a
            # new one.
            else:
                log.info("Registering university '%s' with database." %
                        university.get("name"))
                university = University(
                        name=university.get("name"),
                        abbreviation=university.get("abbreviation"),
                        url=university.get("url"))
                # Add the university to the session.
                session.add(university)

            # Loop over the departments defined in the metadata.
            departments = metadata.get("departments")
            for department in departments:
                # Honor the --cs flag: only register CS departments.
                if args.cs and department.get("abbreviation").lower() \
                        not in cs_abbreviations:
                    continue

                department_query = session.query(Department)\
                        .join(University)\
                        .filter(Department.name == department.get("name"))\
                        .filter(Department.university_id == university.id)

                # If the department has been registered, alert the user.
                if department_query.count() > 0:
                    log.warning("Department '%s' already registered." %
                            department.get("name"))
                    continue

                # Otherwise register a new one.
                university.departments.append(Department(
                        name=department.get("name"),
                        abbreviation=department.get("abbreviation"),
                        url=department.get("url")))
                log.info("Registering department '%s' with database." %
                        department.get("name"))

        except AttributeError as e:
            # The engine module lacks META (or a field within it).
            log.warning("Target %s metadata not defined." % target)
            log.warning("Terminating engine.")
            log.debug(e)
            continue

        # Begin downloading course data.
        try:
            # Check if there are already courses defined for any
            # departments within this university. If there are, skip
            # this target.
            if session.query(Course).join(Department)\
                    .filter(Course.department_id == Department.id)\
                    .filter(Department.university == university)\
                    .count() > 0:
                log.warning("Target %s already has courses defined." %
                        target)

            # Otherwise, go ahead and scrape the course data for this
            # target.
            else:
                scraper.scrape(args)
        except NotImplementedError:
            log.warning("Target %s has not been defined. Skipping." %
                    target)

        log.info("Disengaging scraper engine.")