def filter_course_links(filter_array):
    """Remove filtered courses from ``links_courses.json``.

    Reads the link list through ``scrape.read_from_file``, looks up the first
    ``REGEX_COURSE_CODE`` match in each link, prints the code of every link
    that is dropped, and writes the remaining links back to the same file.

    Args:
        filter_array: Iterable of course-code strings whose links should be
            removed.
    """
    # Build the lookup once so each membership test is O(1).
    excluded = set(filter_array)
    links = scrape.read_from_file("links_courses.json")

    kept = []
    for link in links:
        match = re.search(REGEX_COURSE_CODE, link)
        # A link without a course code cannot be on the filter list, so it is
        # kept instead of crashing on ``None.group``.
        if match is not None and match.group(0) in excluded:
            print(match.group(0))
        else:
            kept.append(link)

    scrape.write_to_file("links_courses.json", kept)
def fix_conditions():
    """Re-process every course's raw conditions and save the result.

    Loads ``courses.json`` through ``scrape.read_from_file``. For each course
    whose ``["conditions"]["raw"]`` value is not ``None``, the whole
    ``"conditions"`` entry is replaced with
    ``process_course_conditions(raw, code)``. The updated mapping is written
    to ``courses_better.json``; the input file is not overwritten.
    """
    # Local name on purpose: avoids shadowing a module-level COURSES constant.
    courses = scrape.read_from_file("courses.json")

    for code, course in courses.items():
        raw = course["conditions"]["raw"]
        if raw is None:
            # Nothing to process for this course; leave its entry untouched.
            continue
        course["conditions"] = process_course_conditions(raw, code)

    scrape.write_to_file("courses_better.json", courses)


# fix_conditions()
import re

import scrape

COURSES = scrape.read_from_file("courses.json")
FILTER_COURSE_CODES = scrape.read_from_file("filter_course_codes.json")

# Raw string: "\d" in a normal literal is an invalid escape sequence.
REGEX_COURSE_CODE = r"[A-Z]{4}\d{4}"

# Term labels that are stripped before reporting; whatever is left over in a
# course's term list gets printed below.
_STANDARD_TERMS = ("Summer Term", "Term 1", "Term 2", "Term 3")

# Report every course that is offered in something other than the standard
# terms. The term lists inside COURSES are modified in place.
for code in COURSES:
    terms = COURSES[code]["terms"]
    if terms is None:
        continue
    for standard_term in _STANDARD_TERMS:
        # Remove at most one occurrence of each label, as list.remove does.
        if standard_term in terms:
            terms.remove(standard_term)
    if terms:
        print(code, terms)

# scrape.write_to_file("courses.json", COURSES)
def get_all_course_codes():
    """Return the course code found in each link of ``links_courses.json``.

    The link list is read through ``scrape.read_from_file`` and the first
    ``REGEX_COURSE_CODE`` match of every link is collected, in file order.
    Links that contain no course code are skipped instead of raising
    ``AttributeError`` on a failed match.

    Returns:
        list[str]: One course code per link that contains one.
    """
    links = scrape.read_from_file("links_courses.json")
    matches = (re.search(REGEX_COURSE_CODE, link) for link in links)
    # re.search returns None when the pattern is absent from the link.
    return [match.group(0) for match in matches if match is not None]
from selenium import webdriver from bs4 import BeautifulSoup import random import scrape import json import time import re LINKS = scrape.read_from_file("links_degrees.json") PROGRAMS = scrape.read_from_file("programs.json") WAIT = 20 REGEX_COURSE_CODE = "[A-Z]{4}\d{4}" REGEX_SPECIALISATION_CODE = "^[A-Z]{5}[H\d]$" REGEX_PROGRAM_CODE = "\d{4}" @scrape.return_null_on_failure def get_program_name(html): return html.find( "h2", class_="css-1b7bj3d-Heading-ComponentHeading-Heading-css-css ezav15i5" ).text @scrape.return_null_on_failure def get_program_code(html): return html.find( "h5", class_="introDetails__sub_heading css-ciwu9x-Subheading-css ezav15i1"
from selenium import webdriver from bs4 import BeautifulSoup import random import scrape import json import time import re SUBJECT_AREAS = scrape.read_from_file("subject_areas.json") SPECIALISATIONS = scrape.read_from_file("specialisations.json") # SPECIALISATIONS = {} LINKS = scrape.read_from_file("links_degrees.json") WAIT = 20 REGEX_COURSE_CODE = "[A-Z]{4}\d{4}" REGEX_SPECIALISATION_CODE = "[A-Z]{5}[H\d]" @scrape.return_null_on_failure def get_degree_name(html): return html.find( "h2", class_="css-1b7bj3d-Heading-ComponentHeading-Heading-css-css ezav15i5" ).text @scrape.return_null_on_failure def get_degree_code(html): return html.find(