def scrape_past_papers():
    """Scrape past-paper PDFs for every course and every available year.

    Reads the year index from SITE, visits each year's page, collects one
    link per course alias found, then writes every collected PDF out under
    ``Past_papers/``.
    """
    driver = load_site(SITE)
    # NOTE(review): find_elements_by_xpath was removed in Selenium 4 —
    # migrate to driver.find_elements(By.XPATH, ...) when upgrading.
    years_data = driver.find_elements_by_xpath(
        "//div[@class='field-item even']/ul/li/a")
    # The page lists years newest-first, so the last element is the earliest.
    earliest = int(years_data[-1].get_attribute("textContent"))
    latest = int(years_data[0].get_attribute("textContent"))
    # Reverse to newest-first so `years` lines up index-for-index with
    # `year_urls` below.
    years = list(reversed(range(earliest, latest + 1)))
    links = {course[0]: {year: [] for year in years} for course in COURSES}
    year_urls = [year_data.get_attribute("href") for year_data in years_data]
    for year, year_url in zip(years, year_urls):
        driver.get(year_url)
        for course_tup in COURSES:
            # course_tup is (acronym, *aliases); try every alias on the page.
            for course in course_tup[1:]:
                link = get_link(driver, course)
                if link is not None:
                    links[course_tup[0]][year].append(link)
    driver.quit()
    for course in COURSES:
        acronym = course[0]
        for year in years:
            urls = links[acronym][year]
            # enumerate() replaces the original zip(urls, range(0, len(urls))).
            for n, url in enumerate(urls):
                # Suffix filenames only when a year has more than one paper.
                mod = f"_{n}" if len(urls) > 1 else ""
                write_out(acronym, f"Past_papers/{acronym}-{year}{mod}", url)
    print("All done!")
def scrape_sft():
    """Download the SFT lecture notes and example-sheet PDFs from SITE."""
    driver = load_site(SITE)
    notes = get_urls(
        driver,
        "//div[@id='content-primary']/strong/center/a[contains(@href,'.pdf')]")
    # Drops the first four matches — presumably non-sheet PDFs on the page;
    # verify against the live site if the layout changes.
    probs = get_urls(
        driver,
        "//div[@id='content-primary']/ul/li/b/a[contains(@href,'.pdf')]")[4:]
    driver.quit()
    # Notes are numbered from 0, example sheets from 1.
    write_out_list(COURSE, "n", enumerate(notes))
    write_out_list(COURSE, "e", enumerate(probs, 1))
    print("All done!")
def scrape_exam_rubrics():
    """Scrape the exam-rubric link for each course and write it out.

    Courses whose rubric link cannot be found on the page are skipped with
    a warning instead of aborting the whole run.
    """
    driver = load_site(SITE)
    links = {}
    for course_tup in COURSES:
        # course_tup is (acronym, *aliases); the last alias found wins.
        for course in course_tup[1:]:
            link = get_link(driver, course)
            if link is not None:
                links[course_tup[0]] = link
    driver.quit()
    for course in COURSES:
        acronym = course[0]
        # Robustness fix: the original did links[acronym] unconditionally and
        # raised KeyError when no alias matched for a course.
        if acronym in links:
            write_out(acronym, f"{acronym}-exam_rubric", links[acronym])
        else:
            print(f"Warning: no exam rubric found for {acronym}")
    print("All done!")
def scrape_sm():
    """Scrape SM lecture notes and example sheets and write them out."""
    driver = load_site(SITE)
    raw = get_named_urls(driver, "//div[@class='main']/ul/li/a")
    driver.quit()
    lecture_notes = []
    problem_sheets = []
    next_lecture = 0
    for name, link in raw:
        if "Example" not in name:
            # Anything that isn't an example sheet is a lecture, numbered
            # from 0 in page order.
            lecture_notes.append((next_lecture, link))
            next_lecture += 1
        else:
            # The sheet number is whatever digits appear in the link text.
            sheet_no = "".join(ch for ch in name if ch.isdigit())
            problem_sheets.append((sheet_no, link))
    write_out_list(COURSE, "n", lecture_notes)
    write_out_list(COURSE, "e", problem_sheets)
    print("All done!")
def scrape_aqft():
    """Scrape AQFT notes, example sheets, and Kai Roehrig's solutions.

    SITE_0 supplies the lecture notes and example sheets; SITE_1 supplies
    the named solution PDFs.
    """
    driver = load_site(SITE_0)
    first = get_urls(driver, "//div[@id='content-primary']/ul/li/b/a")
    driver.get(SITE_1)
    second = get_named_urls(
        driver, "//body/table/tbody/tr/td/a[contains(@href,'.pdf')]")
    driver.quit()
    # The last four links on SITE_0 are the example sheets.
    lecture_notes = first[:-4]
    problem_sheets = first[-4:]
    write_out_list(COURSE, "n", enumerate(lecture_notes, 1))
    write_out_list(COURSE, "e", enumerate(problem_sheets, 1))
    for name, url in second:
        # Idiomatic endswith() replaces the original name[-4:] == ".pdf"
        # slice comparison; strips the extension from the display name.
        if name.endswith(".pdf"):
            name = name[:-4]
        write_out(COURSE, f"Example_sheets/Kai_Roehrig_solutions/{name}", url)
    print("All done!")