def get_faculty_specialisations(browser, faculty_links):
    """Scrape every major/minor/honours specialisation page for one faculty.

    Args:
        browser: selenium webdriver used to fetch each page.
        faculty_links: dict that may contain "majors", "minors" and
            "honours" lists of specialisation URLs (any key may be absent).

    Side effects: updates the module-level SPECIALISATIONS dict and
    rewrites specialisations.json after every successful scrape, so a
    crash loses at most the current page.
    """
    # dict.get with a list default replaces the previous bare
    # `except: pass` blocks, which silently swallowed *every*
    # exception (not just a missing key) and could hide real bugs.
    links = []
    for section in ("majors", "minors", "honours"):
        links.extend(faculty_links.get(section, []))

    total = len(links)

    for idx, link in enumerate(links):
        print(f"{idx + 1} / {total} >>> scraping {link}")

        # Skip specialisations already scraped in a previous run.
        code_in_link = re.search(REGEX_SPECIALISATION_CODE, link).group(0)
        if code_in_link in SPECIALISATIONS:
            continue

        browser.get(link)
        # Random delay to avoid hammering the handbook server.
        time.sleep(random.randint(15, 25))
        html = BeautifulSoup(browser.page_source, "html.parser")

        specialisation_info = get_specialisation_info(html)
        SPECIALISATIONS[specialisation_info["code"]] = specialisation_info
        # Persist incrementally so progress survives a crash.
        scrape.write_to_file("specialisations.json", SPECIALISATIONS)
def filter_course_links(filter_array):
    """Remove every course link whose code appears in *filter_array*.

    Reads links_courses.json, echoes each dropped course code to
    stdout, and writes the surviving links back to the same file.
    """
    all_links = scrape.read_from_file("links_courses.json")
    kept_links = []

    for course_link in all_links:
        course_code = re.search(REGEX_COURSE_CODE, course_link).group(0)
        if course_code not in filter_array:
            kept_links.append(course_link)
        else:
            # Log which codes were filtered out.
            print(course_code)

    scrape.write_to_file("links_courses.json", kept_links)
Пример #3
0
def fix_conditions():
    """Re-process the raw enrolment-condition string of every course.

    Reads courses.json, runs process_course_conditions over each
    course's raw conditions, and writes the result to
    courses_better.json (the input file is left untouched).
    """
    COURSES = scrape.read_from_file("courses.json")

    for code in COURSES:
        raw = COURSES[code]["conditions"]["raw"]
        # `is None` is the correct identity test (PEP 8); `== None`
        # goes through __eq__ and can misbehave on exotic types.
        if raw is None:
            continue

        COURSES[code]["conditions"] = process_course_conditions(raw, code)

    scrape.write_to_file("courses_better.json", COURSES)


# fix_conditions()
Пример #4
0
def get_programs(browser, faculty_links):
    """Scrape every program page listed for one faculty.

    Programs already present in PROGRAMS are skipped, as are Co-op
    variants. PROGRAMS is written to programs.json after each newly
    scraped program so progress survives a crash.
    """
    program_links = faculty_links["programs"]
    if not program_links:
        return None

    count = len(program_links)

    for position, program_link in enumerate(program_links, start=1):
        print(f"{position} / {count} >>> scraping {program_link}")
        # The program code is the final path segment of the URL.
        link_code = program_link.rsplit("/", 1)[-1]
        if link_code in PROGRAMS:
            continue

        browser.get(program_link)
        # Random delay to avoid hammering the handbook server.
        time.sleep(random.randint(3, 7))
        page = BeautifulSoup(browser.page_source, "html.parser")

        program_info = get_program_info(page)
        # Co-op variants ("Co-op" / "Co op") are deliberately excluded.
        if re.search("Co(-| )op", program_info["name"]):
            continue

        PROGRAMS[program_info["code"]] = program_info
        scrape.write_to_file("programs.json", PROGRAMS)
Пример #5
0
        return None

    total = len(faculty_links["programs"])

    for idx, link in enumerate(faculty_links["programs"]):
        print(f"{idx + 1} / {total} >>> scraping {link}")
        code_from_link = link.split("/")[-1]
        if code_from_link in PROGRAMS:
            continue

        browser.get(link)
        time.sleep(random.randint(3, 7))
        html = BeautifulSoup(browser.page_source, "html.parser")

        program_info = get_program_info(html)
        if re.search("Co(-| )op", program_info["name"]):
            continue

        PROGRAMS[program_info["code"]] = program_info
        scrape.write_to_file("programs.json", PROGRAMS)


# Driver: scrape the program pages of every faculty in LINKS.
# PROGRAMS is persisted incrementally inside get_programs; the final
# write below captures the last state once all faculties are done.
browser = webdriver.Chrome(scrape.CHROME_DRIVER)

for faculty in LINKS:
    get_programs(browser, LINKS[faculty])
# get_programs(browser, LINKS["Engineering"])

browser.quit()
scrape.write_to_file("programs.json", PROGRAMS)
from bs4 import BeautifulSoup
import scrape
import json

# Two-way lookup between subject-area codes and their human-readable
# names, scraped from the handbook's "educational area" tab.
SUBJECT_AREAS = {"code_to_subject": {}, "subject_to_code": {}}

html = scrape.get_html(scrape.HANDBOOK_URL)
subject_area_tiles = html.find(id="tab_educational_area").find_all("h3")

for tile in subject_area_tiles:
    # Assumes each <h3> text has the form "CODE: Subject name" —
    # the split raises ValueError if the page layout changes.
    code, subject = tile.text.split(": ")

    SUBJECT_AREAS["code_to_subject"][code] = subject
    SUBJECT_AREAS["subject_to_code"][subject] = code

scrape.write_to_file("subject_areas.json", SUBJECT_AREAS)
        pass

    total = len(links)

    for idx, link in enumerate(links):
        print(f"{idx + 1} / {total} >>> scraping {link}")

        code_in_link = re.search(REGEX_SPECIALISATION_CODE, link).group(0)
        if code_in_link in SPECIALISATIONS:
            continue

        browser.get(link)
        time.sleep(random.randint(15, 25))
        html = BeautifulSoup(browser.page_source, "html.parser")

        specialisation_info = get_specialisation_info(html)
        SPECIALISATIONS[specialisation_info["code"]] = specialisation_info
        scrape.write_to_file("specialisations.json", SPECIALISATIONS)


# Driver: scrape the specialisation pages of every faculty in LINKS.
# SPECIALISATIONS is persisted incrementally inside
# get_faculty_specialisations; the final write captures the last state.
browser = webdriver.Chrome(
    scrape.CHROME_DRIVER)  # NEED TO BE CHROME VERSION 85

for faculty in LINKS:
    get_faculty_specialisations(browser, LINKS[faculty])

# get_faculty_specialisations(browser, LINKS["Engineering"])

browser.quit()
scrape.write_to_file("specialisations.json", SPECIALISATIONS)
Пример #8
0
    browser.get(link)
    time.sleep(WAIT)

    print("\n", link.split("/")[-1])

    return {
        "programs": get_section(browser, "Programs"),
        "double_degrees": get_section(browser, "Double Degrees"),
        "majors": get_section(browser, "Major"),
        "minors": get_section(browser, "Minor"),
        "honours": get_section(browser, "Honours"),
    }


# Open browser
browser = webdriver.Chrome(
    scrape.CHROME_DRIVER)  # NEED TO BE CHROME VERSION 85

# Collect program / double-degree / major / minor / honours links for
# each faculty landing page, then persist the whole mapping at once.
LINKS["ASS"] = get_faculty_degrees(browser, FACULTY_OF_ASS)
LINKS["BE"] = get_faculty_degrees(browser, FACULTY_OF_BE)
LINKS["AD"] = get_faculty_degrees(browser, FACULTY_OF_AD)
LINKS["Engineering"] = get_faculty_degrees(browser, FACULTY_OF_ENGINEERING)
LINKS["Law"] = get_faculty_degrees(browser, FACULTY_OF_LAW)
LINKS["Medicine"] = get_faculty_degrees(browser, FACULTY_OF_MEDICINE)
LINKS["Science"] = get_faculty_degrees(browser, FACULTY_OF_SCIENCE)
LINKS["Business"] = get_faculty_degrees(browser, UNSW_BUSINESS_SCHOOL)

scrape.write_to_file("links_degrees.json", LINKS)

# Close browser
browser.quit()
Пример #9
0
# Driver: scrape every course page, skipping codes already present in
# COURSES. COURSES is persisted after every page so progress survives
# a crash.
for idx, link in enumerate(course_links):
    code_from_link = re.search(REGEX_COURSE_CODE, link).group(0)
    if code_from_link in COURSES:
        print(f" ~~ skipped {code_from_link}")
        continue

    random_int = random.randint(15, 20)
    print(f"{idx + 1}/{total} >>> waiting {random_int} seconds >>> {link}")

    # Get html
    browser.get(link)
    time.sleep(random_int)
    course_html = BeautifulSoup(browser.page_source, "html.parser")
    try:
        course_info = get_course_info(course_html)
        COURSES[course_info["course_code"]] = course_info
        scrape.write_to_file("courses.json", COURSES)
    except Exception:
        # `except Exception` (not a bare `except:`) so Ctrl-C /
        # SystemExit can still abort this long-running scrape; a page
        # that fails to parse is noted loudly and skipped rather than
        # killing the whole run.
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print("@@@                             @@@")
        print(f"@@@     crashed on {code_from_link}     @@@")
        print("@@@                             @@@")
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        continue

# Update unlocks
COURSES = update_unlocks(COURSES)

scrape.write_to_file("courses.json", COURSES)
browser.quit()