Python clean примеры, trajectory.core.clean Python примеры использования

Пример #1

0

Показать файл

Файл: acm.py Проект: jrouly/trajectory

    def load_acm_file(department, data_file):
        """
        Read title/description line pairs from data_file and append one
        Course per usable pair to department.courses.

        Reading stops at the first pair whose description line is empty.
        Pairs for which clean() returns None are skipped.
        """

        def next_line():
            # Only the trailing newline is removed; other whitespace stays.
            return data_file.readline().rstrip('\n')

        title, description_raw = next_line(), next_line()
        while description_raw:
            description = clean(description_raw)
            if description is not None:
                # Course numbers are blanked out (always 0) for this data.
                department.courses.append(Course(
                    number=0,
                    title=title,
                    description=description,
                    description_raw=description_raw))
            title, description_raw = next_line(), next_line()

Пример #2

0

Показать файл

    def load_acm_file(department, data_file):
        """
        Turn the (title, description) line pairs of data_file into Course
        objects appended to department.courses.

        The file is consumed two lines at a time; an empty description line
        ends the import.  A pair is dropped when clean() returns None for
        its description.
        """

        def records():
            # Lazily yield (title, raw description) pairs so that reading
            # and processing stay interleaved.
            while True:
                heading = data_file.readline().rstrip('\n')
                body = data_file.readline().rstrip('\n')
                if not body:
                    return
                yield heading, body

        for title, description_raw in records():
            description = clean(description_raw)
            if description is None:
                continue

            # Course numbers are deliberately blanked out with 0.
            entry = Course(
                number=0,
                title=title,
                description=description,
                description_raw=description_raw)
            department.courses.append(entry)

Пример #3

0

Показать файл

Файл: american.py Проект: lyericly/trajectory

def scrape(args):
    """
    Scrape the available data from the AU catalog into a database.

    Works in two passes.  First, for every department prefix listed in
    META, the catalog index is downloaded and each course popup is parsed
    into a Course that is appended to the matching Department.  Second,
    the prerequisite references collected during the first pass are looked
    up through session and appended to course.prerequisites.

    args -- options object; only args.cs is read.  When it is truthy,
            every prefix other than "csc" is skipped.
    """

    import logging

    log = logging.getLogger("root")
    log.info("Scraping American University data.")

    # Constant values.
    catalog_index_url = "http://catalog.american.edu/preview_course_incoming.php?prefix=%s&catoid=3"
    course_url = "http://catalog.american.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.  Raw string so that \d
    # is a regex escape instead of an invalid string escape.
    prereq_re = re.compile(r"([A-Z]{3,4})(-)(\d{3})")

    # Whitespace splitter, compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower() for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University).filter(University.name == university).first()
    departments = {
        department.abbreviation.lower(): department
        for department in session.query(Department).filter(Department.university == university).all()
    }

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["csc"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a", onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.  The first token of the link text is split
            # on "-" and its second part is used as the course number.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[0].split("-")[1]
            title = " ".join(full_title[1:])

            # Identify coid to get description.
            onclick = course["onclick"]
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr.br

            # Remove garbage
            for div in content.find_all("div"):
                div.extract()

            # Grab extra data: starting at the first <em>, move that node
            # and every following sibling out of the tree, collecting text.
            extra = ""
            em = content.find("em")
            while em:
                if isinstance(em, NavigableString):
                    extra += em
                else:
                    extra += em.text
                rem = em
                em = em.next_sibling
                rem.extract()
            # Strings are immutable, so the result has to be assigned for
            # the cleanup to take effect.  Newlines and non-breaking spaces
            # become plain spaces (rather than being deleted) so adjacent
            # tokens cannot be glued together ahead of the regex search.
            extra = extra.replace("\n", " ").replace("\xa0", " ").strip(" ")

            # Clean up the description
            description_raw = content.text.replace("\n", "").replace("\xa0", "").strip(" ")
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites: everything from the first "requis"
            # onwards is searched for course codes.
            prereq_index = extra.find("requis")
            prereq_list = None
            if prereq_index > -1:

                matches = prereq_re.findall(extra[prereq_index:])

                if matches:
                    # Split them up as a dict and store them in a list.
                    # "d" is the department, "n" the course number.
                    prereq_list = [{"d": match[0], "n": match[-1]} for match in matches]

            # Generate the appropriate course object.
            new_course = Course(number=cnum, title=title, description=description, description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            if d.startswith(("OR ", "ND ")):
                d = d[3:]

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = (
                session.query(Course)
                .join(Department)
                .filter(Department.university == university)
                .filter(func.lower(Department.abbreviation) == d.lower())
                .filter(Course.number == int(n))
                .first()
            )

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Пример #4

0

Показать файл

Файл: gmu.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape course data from the GMU catalog into Course objects.

    Works in two passes.  First, for every department prefix listed in
    META, the catalog index is downloaded and each course popup is parsed
    into a Course that is appended to the matching Department.  Second,
    the prerequisite references collected during the first pass are looked
    up through session and appended to course.prerequisites.

    args -- options object; only args.cs is read.  When it is truthy,
            every prefix other than "cs" and "ait" is skipped.
    """

    import logging
    log = logging.getLogger("root")
    log.info( "Scraping GMU data." )

    # Constant values.
    catalog_index_url = "http://catalog.gmu.edu/preview_course_incoming.php?cattype=combined&prefix=%s"
    course_url = "http://catalog.gmu.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.  Raw string: the pattern
    # text is unchanged, but \s and \d are no longer invalid string escapes.
    prereq_re = re.compile(r"([A-Za-z,]{2,4})(\s|\\xa0)(\d{3})")

    # Whitespace splitter, compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
                    for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
                    for department in session.query(Department)\
                        .filter(Department.university==university)\
                        .all()}

    prereq_dict = {} # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs", "ait"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(
                name="a",
                onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.  Token 1 of the link text is used as the
            # course number; the title starts at token 3.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr.text

            # Clean up the description: keep only what precedes the
            # "Hours of Lecture" marker (the whole text when it is absent).
            description = content.partition("Hours of Lecture")[0]

            # Identify prerequisites
            prereq_index = description.find("Prerequisite(s)")
            prereq_list = None
            if prereq_index > -1:

                # Grab the substring of prereqs and find matches.  "Notes"
                # is only searched for after the prerequisite marker, and a
                # missing "Notes" means "up to the end" -- slicing with the
                # -1 returned by find() would silently drop the last
                # character.
                notes_index = description.find("Notes", prereq_index)
                if notes_index == -1:
                    notes_index = len(description)
                prereq_string = description[prereq_index:notes_index]
                description = description[:prereq_index]
                matches = prereq_re.findall(prereq_string)

                if matches:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                            "d": match[0], # department
                            "n": match[2]  # number
                        } for match in matches]

            # Clean the description string
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(
                number=cnum,
                title=title,
                description=description,
                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n") # prereq course number
            d = prereq.get("d") # prereq course department abbreviation

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError: # no previous courses
                continue

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Пример #5

0

Показать файл

def scrape(args):
    """
    Scrape course data from the Utah catalog into Course objects.

    Works in two passes.  First, for every department prefix listed in
    META, the catalog index is downloaded and each course popup is parsed
    into a Course that is appended to the matching Department.  Second,
    the prerequisite references found under the "Enrollment Requirement:"
    heading are looked up through session and appended to
    course.prerequisites.

    args -- options object; only args.cs is read.  When it is truthy,
            every prefix other than "cs" is skipped.
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping Utah data.")

    # Constant values.
    catalog_index_url = "http://catalog.utah.edu/preview_course_incoming.php?prefix=%s&catoid=6"
    course_url = "http://catalog.utah.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.  Raw string: the pattern
    # text is unchanged, but \s and \d are no longer invalid string escapes.
    prereq_re = re.compile(
        r"(([A-Z]{2,5})|([A-Z]{2}(\s|\\xa0)[A-Z]{2})|(H(\s|\\xa0)[A-Z]{3}))(\s|\\xa0)(\d{4})"
    )

    # Whitespace splitter, compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [
        department.get("abbreviation").lower()
        for department in META.get("departments")
    ]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
                    for department in session.query(Department)\
                        .filter(Department.university==university)\
                        .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                                    onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.  Token 1 of the link text is used as the
            # course number; the title starts at token 3.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Identify all the course heading data: for each <strong>, the
            # text of the siblings that follow it up to the next <br>.
            strongs = content.find_all("strong")
            headings = {}
            tag = content
            for strong in strongs:
                tag = strong.next_sibling
                text = ""
                # Also stop when the siblings run out, so a heading with no
                # trailing <br> does not end in an attribute lookup on None.
                while tag is not None and tag.name != "br":
                    if isinstance(tag, NavigableString):
                        text += tag
                    else:
                        text += tag.text
                    tag = tag.next_sibling
                headings[strong.text] = text

            # Without a node after the last heading there is nothing to
            # read the description from; skip this course.
            if tag is None:
                log.debug("No description found after course headings.")
                continue

            # Update the cursor to post-heading data.
            content = tag

            # Remove the footer links
            for a in tag.find_all("a"):
                a.extract()

            # Clean up the description
            description_raw = content.text.replace('\n', '')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites
            prereq_string = headings.get("Enrollment Requirement:")
            prereq_list = None
            if prereq_string is not None:

                matches = prereq_re.findall(prereq_string)

                if matches:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [
                        {
                            "d": match[0],  # department
                            "n": match[-1]  # number
                        } for match in matches
                    ]

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            if d.startswith(("OR ", "ND ")):
                d = d[3:]

            # If this is a referential prereq, look up the last course
            # observed and hope it's the correct department prefix.
            try:
                if d in ["and", "or", ","]:
                    d = department_stack[-1]
                department_stack.append(d)
            except IndexError:  # no previous courses
                continue

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Пример #6

0

Показать файл

Файл: pdx.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape course data from the PDX CS course list into Course objects.

    The course index page is downloaded, every link in the list following
    the "Course List" heading is visited, and for each page that yields a
    usable description a Course is appended to the Department matching the
    course prefix.  Pages that cannot be fetched or that lack a description
    are logged and skipped.

    args -- options object; only args.cs is read.  When it is truthy,
            every prefix other than "cs" is skipped.
    """

    import logging
    log = logging.getLogger("root")
    log.info( "Scraping PDX CS data." )

    # Construct a soup of the index.
    course_index_url = "http://www.pdx.edu/computer-science/courses"
    course_index = requests.get(course_index_url)
    soup = BeautifulSoup(course_index.text)

    # Identify the list of courses.
    course_list = soup.find("h3", text="Course List")\
                      .find_next_sibling("ul")\
                      .find_all("a")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
                    for department in session.query(Department)\
                        .filter(Department.university==university)\
                        .all()}

    # Patterns compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")
    cdesc_re = re.compile(".*Course Description.*")

    for course in course_list:
        log.debug(course.text)
        full_title = whitespace_re.split(course.text)
        prefix = full_title[0]
        cnum = re.sub('[/]', '-', full_title[1])
        title = ' '.join(full_title[2:])

        if args.cs and prefix.lower() not in ["cs"]:
            continue

        # Read the link target before the try block so the warning below
        # always reports this course's URL (or None), never a stale or
        # unbound name.
        course_url = course.get('href')
        course_soup = None
        if course_url is not None:
            try:
                course_soup = BeautifulSoup(requests.get(course_url).text)
            except Exception:
                # Best effort: any fetch/parse failure skips this course.
                course_soup = None
        if course_soup is None:
            log.warning("Unable to parse course page.")
            log.warning(course_url)
            continue

        # Find the course description based on its neighbour.  Either
        # lookup may come back empty, so guard before dereferencing.
        table = course_soup.find("table")
        cdesc = table.find(text=cdesc_re) if table is not None else None
        if cdesc is not None and not cdesc.next_sibling:
            cdesc = cdesc.find_parent("td")

        # If there's no course description found, well, forget it
        try:
            description = cdesc.find_next_sibling("td").text
        except AttributeError:
            log.warning("No course description available.")
            log.warning(course_url)
            continue

        # Clean the description string.
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # NOTE: prerequisites are not recorded for these courses.  The
        # previous lookup of the "Prerequisites" cell was never used and
        # raised AttributeError on pages without that text, so it is gone.

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info( "Completed scraping." )

Пример #7

0

Показать файл

def scrape(args):
    """
    Scrape CSCE course data from the SC bulletin into Course objects.

    The filtered catalog index is downloaded, each course page is fetched,
    and for every course whose cleaned description is not None a Course is
    appended to the Department matching the course prefix.

    args -- accepted for interface parity with the other scrapers; it is
            not read here.
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping SC CS data.")

    # Generate a BeautifulSoup object.
    catalog_index_url = "http://bulletin.sc.edu/content.php?catoid=36&navoid=4242&filter[27]=CSCE"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(
            name="a",
            onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid.  Raw strings so that \d is a regex
    # escape instead of an invalid string escape.
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")

    # Whitespace splitter, compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")

    # Piecewise course page url.
    course_url = "http://bulletin.sc.edu/preview_course.php?catoid=%s&coid=%s"

    # Markers after which the page text no longer belongs to the
    # description; applied in this order.
    description_end_markers = ("Prereq", "Coreq", "Note:", "Cross-listed")

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
                    for department in session.query(Department)\
                        .filter(Department.university==university)\
                        .all()}

    # Identify relevant information for each course.
    for course in course_list:

        # Generate metadata.  Token 0 is the prefix, token 1 the course
        # number; the title starts at token 3 and loses its apostrophes.
        log.debug(course.text)
        full_title = whitespace_re.split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.h1.next_sibling.next_sibling.text

        # Clean the content string: partition() keeps the whole text when
        # the marker is absent, so no exception handling is needed.
        content = content.partition("Print-Friendly Page Close Window")[0]

        # Clean the description string
        description_raw = content
        for marker in description_end_markers:
            description_raw = description_raw.partition(marker)[0]

        description = clean(description_raw)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info( "Completed scraping." )

Пример #8

0

Показать файл

def scrape(args):
    """
    Scrape CSC course data from the LSU catalog into Course objects.

    The filtered catalog index is downloaded, each course page is fetched,
    and for every course whose cleaned description is not None a Course is
    appended to the Department matching the course prefix.

    args -- accepted for interface parity with the other scrapers; it is
            not read here.
    """

    import logging
    log = logging.getLogger("root")
    log.info( "Scraping LSU CS data." )

    # Generate a BeautifulSoup object.
    catalog_index_url = "http://catalog.lsu.edu/content.php?filter[27]=CSC&cur_cat_oid=6&navoid=538"
    catalog_index = requests.get( catalog_index_url )
    soup = BeautifulSoup( catalog_index.text )

    # Identify the list of courses.
    course_list = soup.find_all(
            name="a",
            onclick=re.compile("showCourse.*"))

    # Select only the catoid and coid.  Raw strings so that \d is a regex
    # escape instead of an invalid string escape.
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")

    # Whitespace splitter, compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")

    # Piecewise course page url.
    course_url = "http://catalog.lsu.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
                    for department in session.query(Department)\
                        .filter(Department.university==university)\
                        .all()}

    # Identify relevant information for each course.
    for course in course_list:

        # Generate metadata.  Token 0 is the prefix, token 1 the course
        # number; the title is everything between token 1 and the last
        # token, with apostrophes removed.
        log.debug(course.text)
        full_title = whitespace_re.split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[2:-1])
        title = title.replace("'", "")

        # Identify coid to get description.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        content = course_soup.find(class_="block_content").hr

        # Remove the metadata.
        for em in content.find_all("em"):
            em.decompose()

        # Clean the description string
        description_raw = content.text
        description = clean(description_raw)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info( "Completed scraping." )

Пример #9

0

Показать файл

Файл: utk.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape course data from the UTK catalog into Course objects.

    Works in two passes.  First, for every department prefix listed in
    META, the catalog index is downloaded and each course popup is parsed
    into a Course that is appended to the matching Department.  Second,
    the course numbers found after the word "requisite" in the popup's
    emphasised text are looked up through session (always within the
    course's own department) and appended to course.prerequisites.

    args -- options object; only args.cs is read.  When it is truthy,
            every prefix other than "cosc" is skipped.
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping UTK data.")

    # Constant values.
    catalog_index_url = "http://catalog.utk.edu/preview_course_incoming.php?prefix=%s&catoid=18"
    course_url = "http://catalog.utk.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles.  Raw string: the pattern
    # text is unchanged, but \s and \d are no longer invalid string escapes.
    prereq_re = re.compile(r"(\s|\\xa0)(\d{3})")

    # Whitespace splitter, compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object.
    prefixes = [
        department.get("abbreviation").lower()
        for department in META.get("departments")
    ]

    # Fetch existing metadata objects from database.
    university = META.get("school").get("name")
    university = session.query(University)\
            .filter(University.name==university)\
            .first()
    departments = {department.abbreviation.lower() : department
                    for department in session.query(Department)\
                        .filter(Department.university==university)\
                        .all()}

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cosc"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a",
                                    onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata.  Token 1 of the link text is used as the
            # course number; the title starts at token 3.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify coid to get description.
            onclick = course['onclick']
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Clean out garbage
            for div in content.find_all("div"):
                div.extract()

            # Pull every <em> out of the tree and keep its text.  The
            # stripped string is assigned: str.strip() returns a new
            # string and leaves the original untouched.
            extra = ' '.join(
                [em.extract().text for em in content.find_all("em")]
            ).strip(' ')

            # Clean up the description
            description_raw = content.text.replace('\n', '').strip(' ')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites
            prereq_index = extra.find("requisite")
            prereq_list = None
            if prereq_index > -1:

                prereq_string = extra[prereq_index:]
                matches = prereq_re.findall(prereq_string)

                if matches:
                    # Split them up as a dict and store them in a list.
                    # Naive assumption that every course number is within
                    # the same department because the department titles are
                    # written out and not matchable.
                    prereq_list = [
                        {
                            "d": prefix.lower(),  # department
                            "n": match[-1]  # number
                        } for match in matches
                    ]

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university==university) \
                    .filter(func.lower(Department.abbreviation)==d.lower()) \
                    .filter(Course.number==int(n)) \
                    .first()

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course).
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Пример #10

0

Показать файл

Файл: rpi.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape the RPI CS course catalog and attach the courses found to the
    matching department objects loaded from the database.

    The catalog index page is fetched, every course link on it is followed
    to its detail page, catalog boilerplate and the prerequisite sentence
    are trimmed from the description, and the remainder is passed through
    ``clean``. Courses whose cleaned description is ``None`` are skipped;
    all others are appended to the department whose lower-cased
    abbreviation equals the course prefix (a ``KeyError`` is raised if no
    such department was loaded).

    :param args: parsed command-line arguments (not read by this scraper).
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping RPI CS data.")

    # Generate a BeautifulSoup object of the catalog index.
    catalog_index_url = "http://catalog.rpi.edu/content.php?filter[27]=CSCI&cur_cat_oid=13&navoid=313"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(name="a", onclick=re.compile("showCourse.*"))

    # Regexes are compiled once, up front, and written as raw strings so
    # that "\d" / "\s" are not treated as (invalid) string escapes.
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")
    whitespace_re = re.compile(r"\s+")
    prereq_re = re.compile(r"\w{2,4}\s\d{3}")

    # Piecewise course page url.
    course_url = "http://catalog.rpi.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    school_name = META.get("school").get("name")
    university = (session.query(University)
                  .filter(University.name == school_name)
                  .first())
    departments = {
        department.abbreviation.lower(): department
        for department in (session.query(Department)
                           .filter(Department.university == university)
                           .all())
    }

    # Identify relevant information for each course.
    # TODO: prereqs is collected but not yet matched to database entries.
    prereqs = {}
    for course in course_list:

        # Generate metadata from the whitespace-separated link text: the
        # first token is the prefix, the second the number, and the title
        # starts at the fourth token (the third token is discarded).
        log.debug(course.text)
        full_title = whitespace_re.split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:]).replace("'", "")

        # Identify catoid / coid to build the course page url.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        description = course_soup.find(class_="block_content").hr.text

        # Cut the description at each boilerplate marker. Each marker is
        # handled independently so that a page lacking "Credit Hours" is
        # still truncated at "When Offered".
        for marker in ("Credit Hours", "When Offered"):
            marker_index = description.find(marker)
            if marker_index > -1:
                description = description[:marker_index]

        # Identify prerequisites and remove them from the description.
        prereq_index = description.find("Prerequisit")
        if prereq_index > -1:
            prereq_string = description[prereq_index:]
            description = description[:prereq_index]

            matches = prereq_re.findall(prereq_string)
            if matches:
                prereqs["%s %s" % (prefix, cnum)] = matches

        # Clean the description string; skip courses that clean to None.
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(Course(
            number=cnum,
            title=title,
            description_raw=description_raw,
            description=description))

    log.info("Completed scraping.")

Пример #11

0

Показать файл

Файл: stanford.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape the Stanford ExploreCourses CS catalog and attach the courses
    found to the matching department objects loaded from the database.

    A fixed number of result pages is requested. For every search result
    the title, course identifier and description are read, everything from
    the first "Prerequisite" onwards is dropped from the description, and
    the remainder is passed through ``clean``. Courses whose cleaned
    description is ``None`` are skipped; all others are appended to the
    department whose lower-cased abbreviation equals the course prefix (a
    ``KeyError`` is raised if no such department was loaded).

    :param args: parsed command-line arguments (not read by this scraper).
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping Stanford CS data.")

    # Fetch existing metadata objects from database.
    school_name = META.get("school").get("name")
    university = (session.query(University)
                  .filter(University.name == school_name)
                  .first())
    departments = {
        department.abbreviation.lower(): department
        for department in (session.query(Department)
                           .filter(Department.university == university)
                           .all())
    }

    # Static connection information.
    catalog_url = "https://explorecourses.stanford.edu/search?q=CS&view=catalog&filter-departmentcode-CS=on&filter-term-Spring=on&filter-coursestatus-Active=on&page="
    # There are currently only 8 pages, so stop after that many.
    catalog_page_limit = 8
    # Adjacent literals are concatenated so the header value contains a
    # single space between the two halves instead of a run of indentation
    # whitespace from a backslash continuation.
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        ),
    }

    # Compiled once; raw string so "\s" is not an invalid string escape.
    whitespace_re = re.compile(r"\s+")

    # Pages are numbered from 0 up to (but excluding) the page limit.
    for catalog_page in range(catalog_page_limit):

        # Generate a BeautifulSoup object.
        response = requests.get(catalog_url + str(catalog_page),
                                headers=headers)
        soup = BeautifulSoup(response.text)

        # Identify relevant information for each course in the list.
        for course in soup.find_all(class_="searchResult"):

            # Generate metadata.
            title = course.find(class_="courseTitle").text
            identifier = whitespace_re.split(
                course.find(class_="courseNumber").text)
            prefix = identifier[0]
            # Drop the final character of the number token (presumably a
            # trailing colon -- confirm against the page markup).
            cnum = identifier[1][:-1]
            description = course.find(class_="courseDescription").text

            log.debug(identifier)

            # Remove the prerequisite / corequisite text.
            # TODO: Match these up with their database entries.
            prereq_index = description.find("Prerequisite")
            if prereq_index > -1:
                description = description[:prereq_index]

            # Clean the description string; skip courses that clean to None.
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            departments[prefix.lower()].courses.append(Course(
                number=cnum,
                title=title,
                description_raw=description_raw,
                description=description))

    log.info("Completed scraping.")

Пример #12

0

Показать файл

Файл: rpi.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape the RPI CS course catalog and attach the courses found to the
    matching department objects loaded from the database.

    The catalog index page is fetched, every course link on it is followed
    to its detail page, catalog boilerplate and the prerequisite sentence
    are trimmed from the description, and the remainder is passed through
    ``clean``. Courses whose cleaned description is ``None`` are skipped;
    all others are appended to the department whose lower-cased
    abbreviation equals the course prefix (a ``KeyError`` is raised if no
    such department was loaded).

    :param args: parsed command-line arguments (not read by this scraper).
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping RPI CS data.")

    # Generate a BeautifulSoup object of the catalog index.
    catalog_index_url = "http://catalog.rpi.edu/content.php?filter[27]=CSCI&cur_cat_oid=13&navoid=313"
    catalog_index = requests.get(catalog_index_url)
    soup = BeautifulSoup(catalog_index.text)

    # Identify the list of courses.
    course_list = soup.find_all(name="a", onclick=re.compile("showCourse.*"))

    # Regexes are compiled once, up front, and written as raw strings so
    # that "\d" / "\s" are not treated as (invalid) string escapes.
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")
    whitespace_re = re.compile(r"\s+")
    prereq_re = re.compile(r"\w{2,4}\s\d{3}")

    # Piecewise course page url.
    course_url = "http://catalog.rpi.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Fetch existing metadata objects from database.
    school_name = META.get("school").get("name")
    university = (session.query(University)
                  .filter(University.name == school_name)
                  .first())
    departments = {
        department.abbreviation.lower(): department
        for department in (session.query(Department)
                           .filter(Department.university == university)
                           .all())
    }

    # Identify relevant information for each course.
    # TODO: prereqs is collected but not yet matched to database entries.
    prereqs = {}
    for course in course_list:

        # Generate metadata from the whitespace-separated link text: the
        # first token is the prefix, the second the number, and the title
        # starts at the fourth token (the third token is discarded).
        log.debug(course.text)
        full_title = whitespace_re.split(course.text)
        prefix = full_title[0]
        cnum = full_title[1]
        title = ' '.join(full_title[3:]).replace("'", "")

        # Identify catoid / coid to build the course page url.
        href = course['href']
        catoid = catoid_re.search(href).group(0)
        coid = coid_re.search(href).group(0)

        # Generate a BeautifulSoup object of the course description.
        course_page = requests.get(course_url % (catoid, coid))
        course_soup = BeautifulSoup(course_page.text)
        description = course_soup.find(class_="block_content").hr.text

        # Cut the description at each boilerplate marker. Each marker is
        # handled independently so that a page lacking "Credit Hours" is
        # still truncated at "When Offered".
        for marker in ("Credit Hours", "When Offered"):
            marker_index = description.find(marker)
            if marker_index > -1:
                description = description[:marker_index]

        # Identify prerequisites and remove them from the description.
        prereq_index = description.find("Prerequisit")
        if prereq_index > -1:
            prereq_string = description[prereq_index:]
            description = description[:prereq_index]

            matches = prereq_re.findall(prereq_string)
            if matches:
                prereqs["%s %s" % (prefix, cnum)] = matches

        # Clean the description string; skip courses that clean to None.
        description_raw = description
        description = clean(description)
        if description is None:
            continue

        # Generate the appropriate course object.
        departments[prefix.lower()].courses.append(
            Course(number=cnum,
                   title=title,
                   description_raw=description_raw,
                   description=description))

    log.info("Completed scraping.")

Пример #13

0

Показать файл

def scrape(args):
    """
    Scrape the KSU course catalog, attach the courses found to their
    department objects and link the prerequisites that can be resolved.

    For every department loaded from the database the catalog index is
    fetched and each course link is followed to its detail page. The
    description is truncated at the catalog's boilerplate markers and
    passed through ``clean``; courses whose cleaned description is ``None``
    are skipped. Course references found in the "Requisites" section are
    remembered and, once all courses exist, looked up in the database and
    appended to ``course.prerequisites``.

    :param args: parsed command-line arguments; when ``args.cs`` is truthy
        only the "cis" department is scraped.
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping KSU data.")

    def strip_substring(body, substring):
        """Return ``body`` cut just before the first ``substring``, or
        ``body`` unchanged when ``substring`` does not occur."""
        index = body.find(substring)
        return body if index == -1 else body[:index]

    # Constant values.
    catalog_index_url = "http://catalog.k-state.edu/content.php?filter[27]=%s&cur_cat_oid=13&navoid=1425"
    course_url = "http://catalog.k-state.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Scraper regexes, compiled once and written as raw strings so that
    # "\d" / "\s" are not treated as (invalid) string escapes.
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")
    # Groups: (department-ish token)(separator)(three digit number). The
    # separator is whitespace or the literal four characters "\xa0".
    prereq_re = re.compile(r"([A-Za-z,]{2,5})(\s|\\xa0)(\d{3})")
    whitespace_re = re.compile(r"\s+")
    show_course_re = re.compile("showCourse.*")

    # Fetch existing metadata objects from database.
    school_name = META.get("school").get("name")
    university = (session.query(University)
                  .filter(University.name == school_name)
                  .first())
    departments = {
        department.abbreviation.lower(): department
        for department in (session.query(Department)
                           .filter(Department.university == university)
                           .all())
    }

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in departments:
        if args.cs and prefix not in ["cis"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a", onclick=show_course_re)

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata from the whitespace-separated link text:
            # the second token is the number and the title starts at the
            # fourth token. The department comes from the outer loop.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:]).replace("'", "")

            # Identify catoid / coid to build the course page url.
            href = course['href']
            catoid = catoid_re.search(href).group(0)
            coid = coid_re.search(href).group(0)

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catoid, coid))
            course_soup = BeautifulSoup(course_page.text)

            # Identify the components of the description and its metadata.
            result_set = course_soup.find(class_="block_content") \
                    .table \
                    .find_next_sibling("p") \
                    .h1 \
                    .find_next_siblings()

            # Join them together as text, skipping the first sibling.
            content = ' '.join(r.text for r in result_set[1:])

            # Clean up the description by cutting it at each marker in turn.
            description = content
            for marker in ("Note", "Requisites", "When Offered", "UGE course"):
                description = strip_substring(description, marker)

            # Identify prerequisites.
            prereq_list = None
            prereq_index = content.find("Requisites")
            if prereq_index > -1:

                # Grab the substring of prereqs and find matches.
                prereq_string = content[prereq_index:]
                prereq_string = strip_substring(prereq_string, "Note")
                prereq_string = strip_substring(prereq_string, "When Offered")
                matches = prereq_re.findall(prereq_string)

                if matches:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [
                        {
                            "d": match[0],  # department
                            "n": match[2],  # number
                        } for match in matches
                    ]

            # Clean the description string; skip courses that clean to None.
            description_raw = description
            description = clean(description_raw)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(number=cnum,
                                title=title,
                                description=description,
                                description_raw=description_raw)
            departments[prefix].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            # If this is a referential prereq ("... and 301"), reuse the
            # last department observed and hope it's the correct prefix.
            if d in ("and", "or", ","):
                if not department_stack:  # no previous courses
                    continue
                d = department_stack[-1]
            department_stack.append(d)

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university == university) \
                    .filter(func.lower(Department.abbreviation) == d.lower()) \
                    .filter(Course.number == int(n)) \
                    .first()

            # If a valid course was found (and it is not the requesting
            # course itself), tack it onto the prereq list of the
            # requesting course.
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Пример #14

0

Показать файл

Файл: utah.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape the Utah course catalog, attach the courses found to their
    department objects and link the prerequisites that can be resolved.

    For every department abbreviation listed in ``META`` the catalog index
    is fetched and each course link is followed to its detail page. The
    "<strong>" headings on the page are collected into a dict, the text
    after them is used as the raw description and passed through
    ``clean``; courses whose cleaned description is ``None`` are skipped.
    Course references found under the "Enrollment Requirement:" heading are
    remembered and, once all courses exist, looked up in the database and
    appended to ``course.prerequisites``.

    :param args: parsed command-line arguments; when ``args.cs`` is truthy
        only the "cs" department is scraped.
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping Utah data.")

    # Constant values.
    catalog_index_url = "http://catalog.utah.edu/preview_course_incoming.php?prefix=%s&catoid=6"
    course_url = "http://catalog.utah.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex matching everything in the onclick attribute that is NOT the
    # "catid, coid" pair, so that substituting "" leaves only the pair.
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course titles. Group 1 is the department
    # token and the last group the four digit number. Written as a raw
    # string so "\s" / "\d" are not treated as (invalid) string escapes.
    prereq_re = re.compile(r"(([A-Z]{2,5})|([A-Z]{2}(\s|\\xa0)[A-Z]{2})|(H(\s|\\xa0)[A-Z]{3}))(\s|\\xa0)(\d{4})")

    # Helper regexes, compiled once instead of once per course.
    whitespace_re = re.compile(r"\s+")
    show_course_re = re.compile("showCourse.*")

    # List of prefixes from the META object.
    prefixes = [department.get("abbreviation").lower()
                for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    school_name = META.get("school").get("name")
    university = (session.query(University)
                  .filter(University.name == school_name)
                  .first())
    departments = {
        department.abbreviation.lower(): department
        for department in (session.query(Department)
                           .filter(Department.university == university)
                           .all())
    }

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix not in ["cs"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a", onclick=show_course_re)

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata from the whitespace-separated link text:
            # the second token is the number and the title starts at the
            # fourth token.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:])

            # Identify catid / coid to get the description.
            onclick = course['onclick']
            (catid, coid) = catid_re.sub("", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            content = course_soup.find(class_="block_content_popup").hr

            # Identify all the course heading data: for each <strong>,
            # gather the text of its following siblings up to the next <br>.
            strongs = content.find_all("strong")
            headings = {}
            tag = content
            for strong in strongs:
                tag = strong.next_sibling
                text = ""
                while tag.name != "br":
                    if isinstance(tag, NavigableString):
                        text += tag
                    else:
                        text += tag.text
                    tag = tag.next_sibling
                headings[strong.text] = text

            # Update the cursor to post-heading data (the last <br> reached,
            # or the original element when there were no headings).
            content = tag

            # Remove the footer links.
            for anchor in tag.find_all("a"):
                anchor.extract()

            # Clean up the description; skip courses that clean to None.
            description_raw = content.text.replace('\n', '')
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites.
            prereq_string = headings.get("Enrollment Requirement:")
            prereq_list = None
            if prereq_string is not None:

                matches = prereq_re.findall(prereq_string)

                if matches:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                            "d": match[0],   # department
                            "n": match[-1],  # number
                        } for match in matches]

            # Generate the appropriate course object.
            new_course = Course(
                number=cnum,
                title=title,
                description=description,
                description_raw=description_raw)
            departments[prefix.lower()].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            # The two-word branch of prereq_re can swallow the tail of a
            # conjunction ("OR CS", "ND CS"); drop that three character
            # lead-in to recover the department.
            if d.startswith("OR ") or d.startswith("ND "):
                d = d[3:]

            # If this is a referential prereq, reuse the last department
            # observed and hope it's the correct prefix.
            # NOTE(review): prereq_re only captures upper-case department
            # tokens, so this branch looks unreachable here -- confirm.
            if d in ("and", "or", ","):
                if not department_stack:  # no previous courses
                    continue
                d = department_stack[-1]
            department_stack.append(d)

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university == university) \
                    .filter(func.lower(Department.abbreviation) == d.lower()) \
                    .filter(Course.number == int(n)) \
                    .first()

            # If a valid course was found (and it is not the requesting
            # course itself), tack it onto the prereq list of the
            # requesting course.
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Пример #15

0

Показать файл

Файл: ksu.py Проект: jrouly/trajectory

def scrape(args):
    """
    Scrape the KSU course catalog, attach the courses found to their
    department objects and link the prerequisites that can be resolved.

    For every department loaded from the database the catalog index is
    fetched and each course link is followed to its detail page. The
    description is truncated at the catalog's boilerplate markers and
    passed through ``clean``; courses whose cleaned description is ``None``
    are skipped. Course references found in the "Requisites" section are
    remembered and, once all courses exist, looked up in the database and
    appended to ``course.prerequisites``.

    :param args: parsed command-line arguments; when ``args.cs`` is truthy
        only the "cis" department is scraped.
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping KSU data.")

    def strip_substring(body, substring):
        """Return ``body`` cut just before the first ``substring``, or
        ``body`` unchanged when ``substring`` does not occur."""
        index = body.find(substring)
        return body if index == -1 else body[:index]

    # Constant values.
    catalog_index_url = "http://catalog.k-state.edu/content.php?filter[27]=%s&cur_cat_oid=13&navoid=1425"
    course_url = "http://catalog.k-state.edu/preview_course_nopop.php?catoid=%s&coid=%s"

    # Scraper regexes, compiled once and written as raw strings so that
    # "\d" / "\s" are not treated as (invalid) string escapes.
    catoid_re = re.compile(r"(?<=catoid=)\d+")
    coid_re = re.compile(r"(?<=coid=)\d+")
    # Groups: (department-ish token)(separator)(three digit number). The
    # separator is whitespace or the literal four characters "\xa0".
    prereq_re = re.compile(r"([A-Za-z,]{2,5})(\s|\\xa0)(\d{3})")
    whitespace_re = re.compile(r"\s+")
    show_course_re = re.compile("showCourse.*")

    # Fetch existing metadata objects from database.
    school_name = META.get("school").get("name")
    university = (session.query(University)
                  .filter(University.name == school_name)
                  .first())
    departments = {
        department.abbreviation.lower(): department
        for department in (session.query(Department)
                           .filter(Department.university == university)
                           .all())
    }

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in departments:
        if args.cs and prefix not in ["cis"]:
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a", onclick=show_course_re)

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata from the whitespace-separated link text:
            # the second token is the number and the title starts at the
            # fourth token. The department comes from the outer loop.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = ' '.join(full_title[3:]).replace("'", "")

            # Identify catoid / coid to build the course page url.
            href = course['href']
            catoid = catoid_re.search(href).group(0)
            coid = coid_re.search(href).group(0)

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catoid, coid))
            course_soup = BeautifulSoup(course_page.text)

            # Identify the components of the description and its metadata.
            result_set = course_soup.find(class_="block_content") \
                    .table \
                    .find_next_sibling("p") \
                    .h1 \
                    .find_next_siblings()

            # Join them together as text, skipping the first sibling.
            content = ' '.join(r.text for r in result_set[1:])

            # Clean up the description by cutting it at each marker in turn.
            description = content
            for marker in ("Note", "Requisites", "When Offered", "UGE course"):
                description = strip_substring(description, marker)

            # Identify prerequisites.
            prereq_list = None
            prereq_index = content.find("Requisites")
            if prereq_index > -1:

                # Grab the substring of prereqs and find matches.
                prereq_string = content[prereq_index:]
                prereq_string = strip_substring(prereq_string, "Note")
                prereq_string = strip_substring(prereq_string, "When Offered")
                matches = prereq_re.findall(prereq_string)

                if matches:
                    # Split them up as a dict and store them in a list.
                    prereq_list = [{
                            "d": match[0],  # department
                            "n": match[2],  # number
                        } for match in matches]

            # Clean the description string; skip courses that clean to None.
            description_raw = description
            description = clean(description_raw)
            if description is None:
                continue

            # Generate the appropriate course object.
            new_course = Course(
                number=cnum,
                title=title,
                description=description,
                description_raw=description_raw)
            departments[prefix].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        department_stack = []
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            # If this is a referential prereq ("... and 301"), reuse the
            # last department observed and hope it's the correct prefix.
            if d in ("and", "or", ","):
                if not department_stack:  # no previous courses
                    continue
                d = department_stack[-1]
            department_stack.append(d)

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = session.query(Course) \
                    .join(Department) \
                    .filter(Department.university == university) \
                    .filter(func.lower(Department.abbreviation) == d.lower()) \
                    .filter(Course.number == int(n)) \
                    .first()

            # If a valid course was found (and it is not the requesting
            # course itself), tack it onto the prereq list of the
            # requesting course.
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Пример #16

0

Показать файл

def scrape(args):
    """
    Scrape the Stanford ExploreCourses CS catalog and attach the courses
    found to the matching department objects loaded from the database.

    A fixed number of result pages is requested. For every search result
    the title, course identifier and description are read, everything from
    the first "Prerequisite" onwards is dropped from the description, and
    the remainder is passed through ``clean``. Courses whose cleaned
    description is ``None`` are skipped; all others are appended to the
    department whose lower-cased abbreviation equals the course prefix (a
    ``KeyError`` is raised if no such department was loaded).

    :param args: parsed command-line arguments (not read by this scraper).
    """

    import logging
    log = logging.getLogger("root")
    log.info("Scraping Stanford CS data.")

    # Fetch existing metadata objects from database.
    school_name = META.get("school").get("name")
    university = (session.query(University)
                  .filter(University.name == school_name)
                  .first())
    departments = {
        department.abbreviation.lower(): department
        for department in (session.query(Department)
                           .filter(Department.university == university)
                           .all())
    }

    # Static connection information.
    catalog_url = "https://explorecourses.stanford.edu/search?q=CS&view=catalog&filter-departmentcode-CS=on&filter-term-Spring=on&filter-coursestatus-Active=on&page="
    # There are currently only 8 pages, so stop after that many.
    catalog_page_limit = 8
    # Adjacent literals are concatenated so the header value contains a
    # single space between the two halves instead of a run of indentation
    # whitespace from a backslash continuation.
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        ),
    }

    # Compiled once; raw string so "\s" is not an invalid string escape.
    whitespace_re = re.compile(r"\s+")

    # Pages are numbered from 0 up to (but excluding) the page limit.
    for catalog_page in range(catalog_page_limit):

        # Generate a BeautifulSoup object.
        response = requests.get(catalog_url + str(catalog_page),
                                headers=headers)
        soup = BeautifulSoup(response.text)

        # Identify relevant information for each course in the list.
        for course in soup.find_all(class_="searchResult"):

            # Generate metadata.
            title = course.find(class_="courseTitle").text
            identifier = whitespace_re.split(
                course.find(class_="courseNumber").text)
            prefix = identifier[0]
            # Drop the final character of the number token (presumably a
            # trailing colon -- confirm against the page markup).
            cnum = identifier[1][:-1]
            description = course.find(class_="courseDescription").text

            log.debug(identifier)

            # Remove the prerequisite / corequisite text.
            # TODO: Match these up with their database entries.
            prereq_index = description.find("Prerequisite")
            if prereq_index > -1:
                description = description[:prereq_index]

            # Clean the description string; skip courses that clean to None.
            description_raw = description
            description = clean(description)
            if description is None:
                continue

            # Generate the appropriate course object.
            departments[prefix.lower()].courses.append(
                Course(number=cnum,
                       title=title,
                       description_raw=description_raw,
                       description=description))

    log.info("Completed scraping.")

Пример #17

0

Показать файл

Файл: utk.py Проект: lyericly/trajectory

def scrape(args):
    """
    Scrape the available course data from the UTK catalog into the database.

    For every department abbreviation listed in META, the catalog index
    page is fetched, then each course's popup page. The course description
    is cleaned and a new Course is appended to the matching Department.
    Three-digit numbers found after the word "requisite" in the course's
    emphasized (<em>) text are remembered and, once every course has been
    created, looked up in the database and linked as prerequisites.

    args -- object exposing a ``cs`` attribute (presumably the parsed
            command-line options). When ``args.cs`` is truthy only the
            "cosc" prefix is scraped.
    """

    import logging

    log = logging.getLogger("root")
    log.info("Scraping UTK data.")

    # Constant values.
    catalog_index_url = "http://catalog.utk.edu/preview_course_incoming.php?prefix=%s&catoid=18"
    course_url = "http://catalog.utk.edu/preview_course.php?catoid=%s&coid=%s"

    # Regex to select only the catid and coid: everything it matches is
    # deleted from the onclick attribute, leaving "catid, coid".
    catid_re = re.compile("(^[^']*)|(, this.*$)|(')")

    # Regex to isolate prerequisite course numbers: three digits preceded
    # by whitespace or by the literal four-character text "\xa0".
    prereq_re = re.compile(r"(\s|\\xa0)(\d{3})")

    # Regex to split a course link's text into whitespace-separated tokens.
    # Compiled once here rather than once per course.
    whitespace_re = re.compile(r"\s+")

    # List of prefixes from the META object (already lower-cased).
    prefixes = [department.get("abbreviation").lower() for department in META.get("departments")]

    # Fetch existing metadata objects from database.
    university_name = META.get("school").get("name")
    university = session.query(University).filter(University.name == university_name).first()
    departments = {
        department.abbreviation.lower(): department
        for department in session.query(Department).filter(Department.university == university).all()
    }

    prereq_dict = {}  # Dictionary of Course : Prereq match list
    for prefix in prefixes:
        if args.cs and prefix != "cosc":
            continue
        catalog_index = requests.get(catalog_index_url % prefix)
        soup = BeautifulSoup(catalog_index.text)

        # Identify the list of courses.
        course_list = soup.find_all(name="a", onclick=re.compile("showCourse.*"))

        # Identify relevant information for each course.
        for course in course_list:

            # Generate metadata. The second token of the link text is taken
            # as the course number and everything from the fourth token on
            # as the title.
            log.debug(course.text)
            full_title = whitespace_re.split(course.text)
            cnum = full_title[1]
            title = " ".join(full_title[3:])

            # Identify coid to get description.
            onclick = course["onclick"]
            (catid, coid) = re.sub(catid_re, "", onclick).split(", ")

            # Generate a BeautifulSoup object of the course description.
            course_page = requests.get(course_url % (catid, coid))
            course_soup = BeautifulSoup(course_page.text)
            popup = course_soup.find(class_="block_content_popup")
            content = popup.hr if popup is not None else None
            if content is None:
                # A page without the expected markup should not abort the
                # whole scrape; skip just this course.
                log.warning("No description block found for '%s'; skipping.", course.text)
                continue

            # Clean out garbage. The <em> elements are removed from the
            # description but their text is kept, since that is where the
            # prerequisite information is searched for below.
            for div in content.find_all("div"):
                div.extract()
            extra = " ".join([em.extract().text for em in content.find_all("em")])
            # str.strip returns a new string, so the result must be kept.
            extra = extra.strip(" ")

            # Clean up the description
            description_raw = content.text.replace("\n", "").strip(" ")
            description = clean(description_raw)
            if description is None:
                continue

            # Identify prerequisites
            prereq_index = extra.find("requisite")
            prereq_list = None
            if prereq_index > -1:

                prereq_string = extra[prereq_index:]
                matches = prereq_re.findall(prereq_string)

                if matches:
                    # Split them up as a dict and store them in a list.
                    # Naive assumption that every course number is within
                    # the same department because the department titles are
                    # written out and not matchable.
                    # "d" is the department abbreviation, "n" the course
                    # number (the last regex group of each match).
                    prereq_list = [{"d": prefix, "n": match[-1]} for match in matches]

            # Generate the appropriate course object.
            new_course = Course(number=cnum, title=title, description=description, description_raw=description_raw)
            departments[prefix].courses.append(new_course)

            # Add in the requested list of prereqs if found.
            if prereq_list is not None:
                prereq_dict[new_course] = prereq_list

    # Iterate over the list of courses, now that they've been created, and
    # process their list of requested prerequisites.
    for course, prereq_list in prereq_dict.items():

        # Skip any courses with a 0-length requested prereq list.
        if not prereq_list:
            continue

        log.debug(course)
        log.debug(prereq_list)

        # Loop over set of prereqs, if there are multiple.
        for prereq in prereq_list:
            n = prereq.get("n")  # prereq course number
            d = prereq.get("d")  # prereq course department abbreviation

            log.debug("Searching for: %s %s", d, n)

            # Reference the prerequisite course identified by this
            # department abbreviation and course number.
            prereq_course = (
                session.query(Course)
                .join(Department)
                .filter(Department.university == university)
                .filter(func.lower(Department.abbreviation) == d.lower())
                .filter(Course.number == int(n))
                .first()
            )

            # If a valid course was found, tack it onto the prereq list of
            # the requesting course (course). A course is never recorded as
            # its own prerequisite.
            if prereq_course and prereq_course is not course:
                course.prerequisites.append(prereq_course)
            else:
                log.debug("Failed to find course matching '%s %s'.", d, n)

    log.info("Completed scraping.")

Python clean примеры использования