import random
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver

import scrape


def get_faculty_specialisations(browser, faculty_links):
    links = []
    # A faculty may be missing any of these sections, so fall back to an empty list
    for key in ("majors", "minors", "honours"):
        links.extend(faculty_links.get(key) or [])

    total = len(links)
    for idx, link in enumerate(links):
        print(f"{idx + 1} / {total} >>> scraping {link}")
        code_in_link = re.search(REGEX_SPECIALISATION_CODE, link).group(0)
        if code_in_link in SPECIALISATIONS:
            continue  # already scraped; skip so interrupted runs can resume

        browser.get(link)
        time.sleep(random.randint(15, 25))  # randomised delay to avoid rate limiting

        html = BeautifulSoup(browser.page_source, "html.parser")
        specialisation_info = get_specialisation_info(html)
        SPECIALISATIONS[specialisation_info["code"]] = specialisation_info
        # Write after every specialisation so a crash loses at most one entry
        scrape.write_to_file("specialisations.json", SPECIALISATIONS)
def filter_course_links(filter_array):
    course_links = scrape.read_from_file("links_courses.json")
    filtered_links = []
    for link in course_links:
        code = re.search(REGEX_COURSE_CODE, link).group(0)
        if code in filter_array:
            print(code)
        else:
            filtered_links.append(link)
    scrape.write_to_file("links_courses.json", filtered_links)
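# Hypothetical usage, in the style of the commented-out driver calls elsewhere in
# these scripts: prune links_courses.json down to the courses not yet present in
# courses.json before resuming a partial scrape. Assumes courses.json is keyed by
# course code, as in the scraping loop at the bottom of this file.
# filter_course_links(list(scrape.read_from_file("courses.json").keys()))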
def fix_conditions():
    COURSES = scrape.read_from_file("courses.json")
    for code in COURSES:
        raw = COURSES[code]["conditions"]["raw"]
        if raw is None:
            continue  # course has no enrolment conditions to parse
        COURSES[code]["conditions"] = process_course_conditions(raw, code)
    scrape.write_to_file("courses_better.json", COURSES)


# fix_conditions()
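# process_course_conditions is defined elsewhere in the repo; the sketch below is
# a hypothetical stand-in that only illustrates the contract fix_conditions
# relies on (raw condition string in, structured dict out). The output field
# names here are guesses, not the real format.
def process_course_conditions_sketch(raw, code):
    # Pull out anything that looks like a UNSW course code, e.g. COMP1511
    prereq_codes = re.findall(r"[A-Z]{4}[0-9]{4}", raw)
    return {
        "raw": raw,
        "prerequisite_codes": [c for c in prereq_codes if c != code],
    }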
def get_programs(browser, faculty_links):
    if not faculty_links["programs"]:
        return None

    total = len(faculty_links["programs"])
    for idx, link in enumerate(faculty_links["programs"]):
        print(f"{idx + 1} / {total} >>> scraping {link}")
        code_from_link = link.split("/")[-1]
        if code_from_link in PROGRAMS:
            continue  # already scraped

        browser.get(link)
        time.sleep(random.randint(3, 7))  # randomised delay between requests

        html = BeautifulSoup(browser.page_source, "html.parser")
        program_info = get_program_info(html)
        if re.search("Co(-| )op", program_info["name"]):
            continue  # skip co-op variants; only the base program is kept

        PROGRAMS[program_info["code"]] = program_info
        # Write after every program so a crash loses at most one entry
        scrape.write_to_file("programs.json", PROGRAMS)


browser = webdriver.Chrome(scrape.CHROME_DRIVER)
for faculty in LINKS:
    get_programs(browser, LINKS[faculty])
# get_programs(browser, LINKS["Engineering"])
browser.quit()

scrape.write_to_file("programs.json", PROGRAMS)
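# The Co(-| )op filter above catches both spellings used in program names
# (illustrative strings, not real handbook data):
# re.search("Co(-| )op", "Computer Science (Co-op)")  -> match, program skipped
# re.search("Co(-| )op", "Computer Science")          -> None, program kept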
from bs4 import BeautifulSoup

import scrape

SUBJECT_AREAS = {"code_to_subject": {}, "subject_to_code": {}}

html = scrape.get_html(scrape.HANDBOOK_URL)
subject_area_tiles = html.find(id="tab_educational_area").find_all("h3")
for tile in subject_area_tiles:
    # Each tile heading reads like "ACCT: Accounting"
    code, subject = tile.text.split(": ")
    SUBJECT_AREAS["code_to_subject"][code] = subject
    SUBJECT_AREAS["subject_to_code"][subject] = code

scrape.write_to_file("subject_areas.json", SUBJECT_AREAS)
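# The scrape module's helpers are defined elsewhere; from the call sites in these
# scripts they are assumed to be thin wrappers like this sketch (hypothetical
# implementations: JSON files for persistence, requests + BeautifulSoup for
# static pages).
import json

import requests


def write_to_file(filename, data):
    # Persist scraped data as JSON so runs can be resumed
    with open(filename, "w") as f:
        json.dump(data, f, indent=2)


def read_from_file(filename):
    with open(filename) as f:
        return json.load(f)


def get_html(url):
    return BeautifulSoup(requests.get(url).text, "html.parser")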
browser = webdriver.Chrome(scrape.CHROME_DRIVER)  # driver needs to match Chrome version 85

for faculty in LINKS:
    get_faculty_specialisations(browser, LINKS[faculty])
# get_faculty_specialisations(browser, LINKS["Engineering"])

browser.quit()
scrape.write_to_file("specialisations.json", SPECIALISATIONS)
def get_faculty_degrees(browser, link):
    browser.get(link)
    time.sleep(WAIT)
    print("\n", link.split("/")[-1])
    return {
        "programs": get_section(browser, "Programs"),
        "double_degrees": get_section(browser, "Double Degrees"),
        "majors": get_section(browser, "Major"),
        "minors": get_section(browser, "Minor"),
        "honours": get_section(browser, "Honours"),
    }


# Open browser
browser = webdriver.Chrome(scrape.CHROME_DRIVER)  # driver needs to match Chrome version 85

LINKS = {}
LINKS["ASS"] = get_faculty_degrees(browser, FACULTY_OF_ASS)
LINKS["BE"] = get_faculty_degrees(browser, FACULTY_OF_BE)
LINKS["AD"] = get_faculty_degrees(browser, FACULTY_OF_AD)
LINKS["Engineering"] = get_faculty_degrees(browser, FACULTY_OF_ENGINEERING)
LINKS["Law"] = get_faculty_degrees(browser, FACULTY_OF_LAW)
LINKS["Medicine"] = get_faculty_degrees(browser, FACULTY_OF_MEDICINE)
LINKS["Science"] = get_faculty_degrees(browser, FACULTY_OF_SCIENCE)
LINKS["Business"] = get_faculty_degrees(browser, UNSW_BUSINESS_SCHOOL)

scrape.write_to_file("links_degrees.json", LINKS)

# Close browser
browser.quit()
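# get_section is defined elsewhere; a rough sketch under Selenium 3 (matching the
# positional-path webdriver.Chrome(...) call above) might look like the below.
# The XPath is a guess at the handbook's markup, not the real implementation.
def get_section_sketch(browser, heading):
    # Find the element immediately following the section heading, then collect
    # the target of every link inside it
    section = browser.find_element_by_xpath(
        f"//h2[contains(text(), '{heading}')]/following-sibling::*[1]"
    )
    return [a.get_attribute("href")
            for a in section.find_elements_by_tag_name("a")]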
COURSES = scrape.read_from_file("courses.json")  # resume from any existing progress
course_links = scrape.read_from_file("links_courses.json")
total = len(course_links)
browser = webdriver.Chrome(scrape.CHROME_DRIVER)

for idx, link in enumerate(course_links):
    code_from_link = re.search(REGEX_COURSE_CODE, link).group(0)
    if code_from_link in COURSES:
        print(f" ~~ skipped {code_from_link}")
        continue

    random_int = random.randint(15, 20)
    print(f"{idx + 1}/{total} >>> waiting {random_int} seconds >>> {link}")

    # Get html
    browser.get(link)
    time.sleep(random_int)
    course_html = BeautifulSoup(browser.page_source, "html.parser")
    try:
        course_info = get_course_info(course_html)
        COURSES[course_info["course_code"]] = course_info
        scrape.write_to_file("courses.json", COURSES)
    except Exception:
        # Note crash loudly, then keep going with the next course
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print("@@@                             @@@")
        print(f"@@@ crashed on {code_from_link} @@@")
        print("@@@                             @@@")
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        continue

# Update unlocks
COURSES = update_unlocks(COURSES)
scrape.write_to_file("courses.json", COURSES)

browser.quit()
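# update_unlocks is defined elsewhere; a plausible sketch, assuming each course's
# conditions include the codes of its prerequisites, is to invert that relation
# so every course also lists the courses it unlocks. Field names are hypothetical.
def update_unlocks_sketch(courses):
    # Start every course with an empty unlocks list
    for course in courses.values():
        course["unlocks"] = []
    # For each prerequisite edge A -> B, record B in A's unlocks
    for code, course in courses.items():
        for prereq in course["conditions"].get("prerequisite_codes", []):
            if prereq in courses:
                courses[prereq]["unlocks"].append(code)
    return courses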