예제 #1
0
def import_results(args):
    """
    Read topic data generated from the learn module and store it in the
    database.

    A new ResultSet is created from ``args.alpha``, ``args.beta`` and
    ``args.iterations`` and committed immediately. Topic definitions are
    then read from the CSV file ``args.topic_file`` and per-course topic
    weights from the CSV file ``args.course_file``; the first line of each
    file is skipped as a header.

    NOTE(review): nothing added after the initial ResultSet commit is
    committed by this function -- presumably the caller commits the
    session; confirm.
    """

    from trajectory.models import Course, Topic, CourseTopicAssociation
    from trajectory.models import ResultSet
    from trajectory.models.meta import session
    from trajectory import config as TRJ
    import csv
    import logging
    log = logging.getLogger("root")
    log.info("Begin topic import.")

    # Create a new result set. Its id is read further down when the
    # course/topic associations are built.
    result_set = ResultSet(
        alpha=args.alpha,
        beta=args.beta,
        iterations=args.iterations
    )
    session.add(result_set)
    session.commit()

    # Add in new topic definitions: column 0 is the topic id, the remaining
    # columns are joined into a single comma-separated word string.
    with open(args.topic_file, "r") as topic_file:
        topic_reader = csv.reader(topic_file, delimiter=",")
        next(topic_reader, None)  # skip header
        topic_count = 0
        for topic in topic_reader:
            topic_count += 1
            session.add(Topic(
                id=topic[0],
                result_set=result_set,
                words=', '.join(topic[1:])
            ))
        result_set.num_topics = topic_count

    # Collect the topics for each known course.
    course_query = session.query(Course)
    topics_to_add = {}  # {course: [[id, weight], [id, weight], ...]}
    with open(args.course_file, "r") as course_file:
        course_reader = csv.reader(course_file, delimiter=",")
        next(course_reader, None)  # skip header
        for row in course_reader:
            # Column 1 holds the course id; look it up once per row and
            # ignore rows whose course is not in the database.
            course = course_query.get(row[1])
            if course is None:
                continue

            # Remaining columns are "topicid:weight" entries. Only entries
            # above the configured minimum weight are kept. A later row for
            # the same course replaces the earlier row's entries.
            entries = (entry.split(':') for entry in row[2:])
            topics_to_add[course] = [
                entry for entry in entries
                if float(entry[1]) > TRJ.TOPIC_MIN_WEIGHT
            ]

    # Add the topics to their courses.
    for course, topic_list in topics_to_add.items():
        for (topicid, proportion) in topic_list:
            association = CourseTopicAssociation(proportion=proportion)
            association.topic_id = topicid
            association.result_set_id = result_set.id
            course.topics.append(association)

    log.info("Topic import complete.")
예제 #2
0
def scrape(args):
    """
    Routes scraping to the appropriate scraper module.

    For every name in ``args.targets`` the module of that name is imported
    relative to the ``trajectory.engines`` package. The university and
    departments described by the module's ``META`` mapping are registered
    with the database session when not already present, and the module's
    ``scrape(args)`` function is called unless courses already exist for
    that university.

    When ``args.cs`` is truthy, only departments whose abbreviation is in a
    fixed list of computer-science abbreviations are registered.

    Targets that cannot be imported, that lack usable metadata, or whose
    scraper raises NotImplementedError are logged and skipped.
    """

    from trajectory.models import University, Department, Course
    from trajectory.models.meta import session

    import logging
    from importlib import import_module

    log = logging.getLogger("root")
    log.info("Selected scraping targets: %s.", args.targets)

    # Department abbreviations (lower case) accepted when args.cs is set.
    # Built once rather than on every department iteration.
    cs_abbreviations = frozenset((
        "ec", "ka", "cosc", "csce", "ait",
        "cs", "cis", "csc", "csci",
    ))

    # Loop over the requested targets and call their scrape function.
    for target in args.targets:

        log.info("Engaging scraper engine: %s", target)

        # Prepend the target name with a dot for a relative import.
        try:
            scraper = import_module(".%s" % target, "trajectory.engines")
        except ImportError:
            log.warning("No engine named '%s'.", target)
            continue

        # Register the target with the database, if not already present.
        # Missing metadata surfaces as an AttributeError (no META attribute
        # on the module, or .get() called on a missing entry), which ends
        # processing of this target.
        try:
            metadata = scraper.META

            school = metadata.get("school")
            university = session.query(University)\
                    .filter(University.name==school.get("name"))\
                    .first()

            # If the university has already been registered, alert the user
            # but keep the reference to it for the Departments.
            if university is not None:
                log.warning("University '%s' already registered.",
                        university.name)

            # If the university has not been registered, register a new
            # one.
            else:
                log.info("Registering university '%s' with database.",
                        school.get("name"))
                university = University(
                        name=school.get("name"),
                        abbreviation=school.get("abbreviation"),
                        url=school.get("url"))

                # Add the university to the session.
                session.add(university)

            # Loop over the departments defined in the metadata.
            for department in metadata.get("departments"):

                # In CS-only mode, skip every other department.
                if args.cs and department.get("abbreviation").lower() \
                        not in cs_abbreviations:
                    continue

                department_query = session.query(Department)\
                        .join(University)\
                        .filter(Department.name==department.get("name"))\
                        .filter(Department.university_id==university.id)

                # If the department has been registered, alert the user.
                if department_query.count() > 0:
                    log.warning("Department '%s' already registered.",
                            department.get("name"))
                    continue

                # Otherwise register a new one.
                university.departments.append(Department(
                        name=department.get("name"),
                        abbreviation=department.get("abbreviation"),
                        url=department.get("url")))
                log.info("Registering department '%s' with database.",
                        department.get("name"))

        except AttributeError as e:
            log.warning("Target %s metadata not defined.", target)
            log.warning("Terminating engine.")
            log.debug(e)
            continue

        # Begin downloading course data.
        try:

            # Check if there are already courses defined for any
            # departments within this university. If there are, skip
            # this target.
            if session.query(Course).join(Department) \
                    .filter(Course.department_id==Department.id)\
                    .filter(Department.university==university)\
                    .count() > 0:
                log.warning("Target %s already has courses defined.", target)

            # Otherwise, go ahead and scrape the course data for this
            # target.
            else:
                scraper.scrape(args)

        except NotImplementedError:
            log.warning("Target %s has not been defined. Skipping.", target)

        log.info("Disengaging scraper engine.")